Rebuild extraction pipeline infrastructure (Faza 0 prep)

Implements the approved plan to replace the broken regex/index-master
extraction with an LLM-subagent pipeline. Four parallel lanes:

Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no
  max_pages truncation), normalize_sources.py, chunk_sources.py
  (~20pg chunks + overlap, manifest registry), activity_schema.json.
Lane B — app/config_taxonomy.py (16 fixed category slugs), schema
  rebuilt from scratch in app/models/ with content_type, language,
  source_files, source_excerpt, normalized_name, extraction_confidence,
  needs_review; FTS5 + 3 triggers extended with materials_list and
  skills_developed.
Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy
  source_excerpt validation, dedup with needs_review band),
  validate_extractions.py, review_queue.py, new run_extraction.py
  orchestrator, SUBAGENT_PROMPT.md.
Lane D — search.py content_type/language filters (default search
  excludes non-game content), E7 schema-compat audit; fixed a NULL
  keywords AttributeError in _boost_search_relevance.

Removes 8 orphaned/dead scripts and app/services/parser.py +
indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent).

Note: Lane D made one additive edit to app/models/database.py
(_update_category_counts) to surface content_type/language in
get_filter_options, outside its nominal lane boundary but after
Lane B completed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-05-19 17:43:38 +00:00
parent e0080edf85
commit 66ae831c36
37 changed files with 4101 additions and 1881 deletions

230
app/config_taxonomy.py Normal file
View File

@@ -0,0 +1,230 @@
"""
Controlled category taxonomy for game-library.
Single source of truth for activity categories. The DB stores the *slug*;
the UI displays the Romanian name. `category` (thematic domain) and
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
"""
import unicodedata
import re
from typing import Dict, List
# --- Categories (thematic domain) --------------------------------------------
# slug -> Romanian display name. 16 fixed slugs; `altele` is the mandatory
# fallback and MUST always be present.
# The key (slug) is what gets stored in the DB; the value is the label the UI
# shows. Slugs are ASCII, lowercase and hyphenated, i.e. already in the form
# that _slugify() produces.
CATEGORIES: Dict[str, str] = {
"jocuri-cercetasesti": "Jocuri cercetășești",
"team-building": "Team-building",
"icebreakers": "Icebreakers / spargerea gheții",
"camp-outdoor": "Tabără și activități în aer liber",
"wide-games": "Wide games / jocuri de teren",
"orientare": "Orientare",
"prim-ajutor": "Prim ajutor",
"escape-room-puzzle": "Escape room și puzzle",
"creative-stem": "Creativitate și STEM",
"sports-active": "Sport și activități fizice",
"cantece-ceremonii": "Cântece și ceremonii",
"retete": "Rețete",
"supravietuire": "Supraviețuire",
"integrare-incluziune": "Integrare și incluziune",
"conflict-empatie": "Conflict și empatie",
"altele": "Altele",
}
# Mandatory fallback slug; normalize_category() returns it for empty or
# unrecognised input. Must be a key of CATEGORIES.
FALLBACK_CATEGORY = "altele"
# Ordered list of valid slugs (same order as the CATEGORIES literal above).
CATEGORY_SLUGS: List[str] = list(CATEGORIES.keys())
# --- Content type (form of the content) --------------------------------------
# Independent axis from `category`. The UI default search excludes the
# non-game content types (see plan §6).
# slug -> Romanian display name, laid out the same way as CATEGORIES.
CONTENT_TYPES: Dict[str, str] = {
"joc": "Joc",
"activitate": "Activitate",
"reteta": "Rețetă",
"cantec": "Cântec",
"ceremonie": "Ceremonie",
}
# Valid content_type slugs, in the order they are declared above.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES.keys())
# Content types considered "non-game" — excluded from the default UI search.
# Each entry is also a key of CONTENT_TYPES.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]
# Value normalize_content_type() returns for empty or unrecognised input.
DEFAULT_CONTENT_TYPE = "activitate"
# --- Aliases -----------------------------------------------------------------
# Map of normalized arbitrary strings -> canonical slug. Keys are already
# diacritic-stripped, lowercased and hyphenated (see _slugify). This catches
# legacy / messy values from the old DB and common English/Romanian variants.
#
# Every value must be a key of CATEGORIES. normalize_category() checks
# CATEGORIES before this table, so the identity entries below (for example
# "altele": "altele") are never reached; they are harmless and kept only for
# readability.
_CATEGORY_ALIASES: Dict[str, str] = {
# legacy junk
# NOTE(review): the single letters are presumably the slugified section codes
# of the old index ("[A]" slugifies to "a") — confirm against the old DB.
"general-activity": "altele",
"general": "altele",
"educational": "creative-stem",
"d": "altele",
"a": "altele",
"b": "altele",
"c": "altele",
# scouting
"cercetasie": "jocuri-cercetasesti",
"cercetasesti": "jocuri-cercetasesti",
"scout": "jocuri-cercetasesti",
"scouting": "jocuri-cercetasesti",
"scout-games": "jocuri-cercetasesti",
"jocuri-cercetasesti": "jocuri-cercetasesti",
# team building
"teambuilding": "team-building",
"team": "team-building",
"cooperare": "team-building",
# icebreakers
"icebreaker": "icebreakers",
"spargerea-ghetii": "icebreakers",
"cunoastere": "icebreakers",
"energizers": "icebreakers",
"energizer": "icebreakers",
# camp / outdoor
"camp": "camp-outdoor",
"tabara": "camp-outdoor",
"outdoor": "camp-outdoor",
"aer-liber": "camp-outdoor",
# wide games
"wide-game": "wide-games",
"jocuri-de-teren": "wide-games",
"joc-de-teren": "wide-games",
"big-games": "wide-games",
# orientare
"orienteering": "orientare",
"navigatie": "orientare",
# prim ajutor
"first-aid": "prim-ajutor",
"primul-ajutor": "prim-ajutor",
# escape room / puzzle
"escape-room": "escape-room-puzzle",
"escaperoom": "escape-room-puzzle",
"puzzle": "escape-room-puzzle",
"puzzles": "escape-room-puzzle",
"ghicitori": "escape-room-puzzle",
# creative / stem
"creative": "creative-stem",
"creativitate": "creative-stem",
"stem": "creative-stem",
"arts-and-crafts": "creative-stem",
"craft": "creative-stem",
"crafts": "creative-stem",
"stiinta": "creative-stem",
# sports
"sport": "sports-active",
"sports": "sports-active",
"sportive": "sports-active",
"active": "sports-active",
"miscare": "sports-active",
"physical": "sports-active",
# songs / ceremonies
"cantece": "cantece-ceremonii",
"cantec": "cantece-ceremonii",
"songs": "cantece-ceremonii",
"ceremonii": "cantece-ceremonii",
"ceremonie": "cantece-ceremonii",
"ceremony": "cantece-ceremonii",
# recipes
"reteta": "retete",
"recipe": "retete",
"recipes": "retete",
"cooking": "retete",
"gatit": "retete",
# survival
"survival": "supravietuire",
"supravietuire": "supravietuire",
# inclusion
"integrare": "integrare-incluziune",
"incluziune": "integrare-incluziune",
"inclusion": "integrare-incluziune",
# conflict / empathy
"conflict": "conflict-empatie",
"empatie": "conflict-empatie",
"empathy": "conflict-empatie",
"rezolvarea-conflictelor": "conflict-empatie",
# fallback
"altele": "altele",
"other": "altele",
"others": "altele",
"misc": "altele",
}
def _slugify(value: str) -> str:
"""Lowercase, strip diacritics, collapse non-alphanumerics to hyphens."""
if not value:
return ""
# Decompose accents (ă -> a, ș -> s, ț -> t, etc.)
decomposed = unicodedata.normalize("NFKD", value)
ascii_str = "".join(c for c in decomposed if not unicodedata.combining(c))
ascii_str = ascii_str.lower().strip()
ascii_str = re.sub(r"[^a-z0-9]+", "-", ascii_str)
return ascii_str.strip("-")
def normalize_category(value: str) -> str:
    """Map an arbitrary string to a valid category slug.

    The input is slugified (diacritics stripped, lowercased, hyphenated) and
    resolved in this order:

    1. an exact category slug (a key of CATEGORIES);
    2. a known alias (a key of _CATEGORY_ALIASES);
    3. the Romanian display name of a category (a value of CATEGORIES),
       e.g. "Cântece și ceremonii" -> "cantece-ceremonii".

    Returns one of CATEGORY_SLUGS, falling back to `altele` for anything
    unrecognised or empty.
    """
    if not value:
        return FALLBACK_CATEGORY
    slug = _slugify(str(value))
    if not slug:
        return FALLBACK_CATEGORY
    # Exact slug match.
    if slug in CATEGORIES:
        return slug
    # Alias match.
    if slug in _CATEGORY_ALIASES:
        return _CATEGORY_ALIASES[slug]
    # Display-name match. Several display names slugify to something that is
    # neither the slug nor an alias ("Escape room și puzzle" becomes
    # "escape-room-si-puzzle"), so without this step the labels shown in the
    # UI would be filed under the fallback category.
    for canonical_slug, display_name in CATEGORIES.items():
        if _slugify(display_name) == slug:
            return canonical_slug
    return FALLBACK_CATEGORY
def normalize_content_type(value: str) -> str:
    """Resolve a free-form label to a valid content_type slug.

    The label is slugified and accepted when it is a key of CONTENT_TYPES or
    one of the plural / English aliases listed below. Empty input and
    anything unrecognised yield DEFAULT_CONTENT_TYPE (`activitate`).
    """
    # Plural / English spellings accepted in addition to the canonical slugs.
    alias_to_slug = {
        "jocuri": "joc",
        "game": "joc",
        "games": "joc",
        "activitati": "activitate",
        "activity": "activitate",
        "retete": "reteta",
        "recipe": "reteta",
        "cantece": "cantec",
        "song": "cantec",
        "ceremonii": "ceremonie",
        "ceremony": "ceremonie",
    }
    if value:
        candidate = _slugify(str(value))
        if candidate in CONTENT_TYPES:
            return candidate
        if candidate in alias_to_slug:
            return alias_to_slug[candidate]
    return DEFAULT_CONTENT_TYPE
def is_valid_category(slug: str) -> bool:
    """Report whether `slug` is one of the keys of CATEGORIES."""
    known_slugs = CATEGORIES.keys()
    return slug in known_slugs
def category_display_name(slug: str) -> str:
    """Look up the Romanian label of a category slug.

    An unknown slug is returned unchanged, so the caller always gets
    something it can display.
    """
    try:
        return CATEGORIES[slug]
    except KeyError:
        return slug
def content_type_display_name(slug: str) -> str:
    """Look up the Romanian label of a content_type slug.

    An unknown slug is returned unchanged.
    """
    if slug in CONTENT_TYPES:
        return CONTENT_TYPES[slug]
    return slug

View File

@@ -5,6 +5,22 @@ Activity data model for INDEX-SISTEM-JOCURI v2.0
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
import json
import re
import unicodedata
def normalize_name(name: str) -> str:
    """Build the dedup key for an activity name.

    The name is NFKD-decomposed and stripped of combining marks (which
    removes diacritics), lowercased, trimmed, and every run of whitespace is
    collapsed to one space. Punctuation is kept. Empty input yields an empty
    string. Used as the exact-match key for dedup grouping (see plan §4).
    """
    if not name:
        return ""
    without_marks = "".join(
        ch
        for ch in unicodedata.normalize("NFKD", name)
        if unicodedata.combining(ch) == 0
    )
    trimmed = without_marks.lower().strip()
    return re.sub(r"\s+", " ", trimmed)
@dataclass
class Activity:
@@ -19,10 +35,19 @@ class Activity:
# Categories
category: str = ""
subcategory: Optional[str] = None
# content_type is an axis INDEPENDENT of category:
# one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy).
content_type: Optional[str] = None
# Source information
source_file: str = ""
page_reference: Optional[str] = None
# source_files: JSON-encoded list of every source the activity was seen in.
# `source_file` (singular) stays as the primary/original source; build_database
# (Lane C) accumulates the full list here on dedup-merge.
source_files: List[str] = field(default_factory=list)
# Short verbatim quote from the source — anti-hallucination anchor.
source_excerpt: Optional[str] = None
# Age and participants
age_group_min: Optional[int] = None
@@ -45,11 +70,22 @@ class Activity:
tags: List[str] = field(default_factory=list)
popularity_score: int = 0
# Extraction / language metadata
language: Optional[str] = None # 'ro' / 'en'
normalized_name: Optional[str] = None # dedup key; auto-derived from name
extraction_confidence: Optional[str] = None # 'high' / 'med' / 'low'
needs_review: int = 0
# Database fields
id: Optional[int] = None
created_at: Optional[str] = None
updated_at: Optional[str] = None
def __post_init__(self):
"""Derive normalized_name from name when not explicitly provided."""
if not self.normalized_name:
self.normalized_name = normalize_name(self.name)
def to_dict(self) -> Dict[str, Any]:
"""Convert activity to dictionary for database storage"""
return {
@@ -59,8 +95,11 @@ class Activity:
'variations': self.variations,
'category': self.category,
'subcategory': self.subcategory,
'content_type': self.content_type,
'source_file': self.source_file,
'source_files': json.dumps(self.source_files) if self.source_files else None,
'page_reference': self.page_reference,
'source_excerpt': self.source_excerpt,
'age_group_min': self.age_group_min,
'age_group_max': self.age_group_max,
'participants_min': self.participants_min,
@@ -73,7 +112,11 @@ class Activity:
'difficulty_level': self.difficulty_level,
'keywords': self.keywords,
'tags': json.dumps(self.tags) if self.tags else None,
'popularity_score': self.popularity_score
'popularity_score': self.popularity_score,
'language': self.language,
'normalized_name': self.normalized_name or normalize_name(self.name),
'extraction_confidence': self.extraction_confidence,
'needs_review': self.needs_review,
}
@classmethod
@@ -87,6 +130,16 @@ class Activity:
except (json.JSONDecodeError, TypeError):
tags = []
# source_files may arrive as a JSON string (DB) or a list (extraction)
source_files = data.get('source_files')
if isinstance(source_files, str):
try:
source_files = json.loads(source_files)
except (json.JSONDecodeError, TypeError):
source_files = []
elif source_files is None:
source_files = []
return cls(
id=data.get('id'),
name=data.get('name', ''),
@@ -95,8 +148,11 @@ class Activity:
variations=data.get('variations'),
category=data.get('category', ''),
subcategory=data.get('subcategory'),
content_type=data.get('content_type'),
source_file=data.get('source_file', ''),
source_files=source_files,
page_reference=data.get('page_reference'),
source_excerpt=data.get('source_excerpt'),
age_group_min=data.get('age_group_min'),
age_group_max=data.get('age_group_max'),
participants_min=data.get('participants_min'),
@@ -110,6 +166,10 @@ class Activity:
keywords=data.get('keywords'),
tags=tags,
popularity_score=data.get('popularity_score', 0),
language=data.get('language'),
normalized_name=data.get('normalized_name'),
extraction_confidence=data.get('extraction_confidence'),
needs_review=data.get('needs_review', 0) or 0,
created_at=data.get('created_at'),
updated_at=data.get('updated_at')
)

View File

@@ -30,6 +30,8 @@ class DatabaseManager:
"""Initialize database with v2.0 schema"""
with self._get_connection() as conn:
# Main activities table
# NOTE: schema is rebuilt from scratch (plan §6) — no in-place
# migration. The old DB is deleted and recreated by build_database.
conn.execute("""
CREATE TABLE IF NOT EXISTS activities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -39,8 +41,11 @@ class DatabaseManager:
variations TEXT,
category TEXT NOT NULL,
subcategory TEXT,
content_type TEXT,
source_file TEXT NOT NULL,
source_files TEXT,
page_reference TEXT,
source_excerpt TEXT,
-- Structured parameters
age_group_min INTEGER,
@@ -60,6 +65,13 @@ class DatabaseManager:
keywords TEXT,
tags TEXT,
popularity_score INTEGER DEFAULT 0,
-- Extraction / language metadata
language TEXT,
normalized_name TEXT,
extraction_confidence TEXT,
needs_review INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
@@ -69,6 +81,7 @@ class DatabaseManager:
conn.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5(
name, description, rules, variations, keywords,
materials_list, skills_developed,
content='activities',
content_rowid='id'
)
@@ -92,6 +105,7 @@ class DatabaseManager:
"CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)",
"CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)",
"CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)"
]
@@ -102,24 +116,34 @@ class DatabaseManager:
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities
BEGIN
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
INSERT INTO activities_fts(rowid, name, description, rules, variations,
keywords, materials_list, skills_developed)
VALUES (new.id, new.name, new.description, new.rules, new.variations,
new.keywords, new.materials_list, new.skills_developed);
END
""")
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities
BEGIN
DELETE FROM activities_fts WHERE rowid = old.id;
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
variations, keywords, materials_list, skills_developed)
VALUES ('delete', old.id, old.name, old.description, old.rules,
old.variations, old.keywords, old.materials_list, old.skills_developed);
END
""")
conn.execute("""
CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities
BEGIN
DELETE FROM activities_fts WHERE rowid = old.id;
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
variations, keywords, materials_list, skills_developed)
VALUES ('delete', old.id, old.name, old.description, old.rules,
old.variations, old.keywords, old.materials_list, old.skills_developed);
INSERT INTO activities_fts(rowid, name, description, rules, variations,
keywords, materials_list, skills_developed)
VALUES (new.id, new.name, new.description, new.rules, new.variations,
new.keywords, new.materials_list, new.skills_developed);
END
""")
@@ -179,6 +203,8 @@ class DatabaseManager:
"""Update category usage counts"""
categories_to_update = [
('category', activity.category),
('content_type', activity.content_type),
('language', activity.language),
('age_group', activity.get_age_range_display()),
('participants', activity.get_participants_display()),
('duration', activity.get_duration_display()),
@@ -332,8 +358,11 @@ class DatabaseManager:
def clear_database(self):
"""Clear all data from database"""
with self._get_connection() as conn:
# Deleting from activities fires the delete trigger, which removes
# the matching FTS rows. The explicit 'delete-all' command then
# guarantees the external-content FTS index is fully cleared.
conn.execute("DELETE FROM activities")
conn.execute("DELETE FROM activities_fts")
conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')")
conn.execute("DELETE FROM categories")
conn.commit()

View File

@@ -2,8 +2,6 @@
Services for INDEX-SISTEM-JOCURI v2.0
"""
from .parser import IndexMasterParser
from .indexer import ActivityIndexer
from .search import SearchService
__all__ = ['IndexMasterParser', 'ActivityIndexer', 'SearchService']
__all__ = ['SearchService']

View File

@@ -1,248 +0,0 @@
"""
Activity indexer service for INDEX-SISTEM-JOCURI v2.0
Coordinates parsing and database indexing
"""
from typing import List, Dict, Any
from pathlib import Path
from app.models.database import DatabaseManager
from app.models.activity import Activity
from app.services.parser import IndexMasterParser
import time
class ActivityIndexer:
    """Drives the INDEX_MASTER -> database indexing workflow.

    Parsing is delegated to an IndexMasterParser and persistence to the
    DatabaseManager passed in; this class sequences the two, prints progress
    and assembles result/statistics dictionaries.
    """

    def __init__(self, db_manager: DatabaseManager, index_master_path: str):
        """Keep the database manager and build a parser for ``index_master_path``."""
        self.db = db_manager
        self.parser = IndexMasterParser(index_master_path)
        self.indexing_stats = {}

    def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]:
        """Parse INDEX_MASTER, keep the complete activities and store them.

        Args:
            clear_existing: when true, the database is emptied first.

        Returns:
            A dict that always has a ``'success'`` flag. On failure it also
            has ``'error'``; on success it holds the indexing statistics plus
            ``'inserted_count'``.
        """
        print("🚀 Starting activity indexing process...")
        started_at = time.time()

        if clear_existing:
            print("🗑️ Clearing existing database...")
            self.db.clear_database()

        print("📖 Parsing INDEX_MASTER file...")
        parsed = self.parser.parse_all_categories()
        if not parsed:
            print("❌ No activities were parsed!")
            return {'success': False, 'error': 'No activities parsed'}

        # Keep only the activities the parser considers complete.
        accepted = []
        for candidate in parsed:
            if not self.parser.validate_activity_completeness(candidate):
                print(f"⚠️ Skipping incomplete activity: {candidate.name[:50]}...")
                continue
            accepted.append(candidate)
        print(f"✅ Validated {len(accepted)} activities out of {len(parsed)} parsed")

        if len(accepted) < 100:
            print(f"⚠️ Warning: Only {len(accepted)} valid activities found. Expected 500+")

        print("💾 Inserting activities into database...")
        try:
            inserted_count = self.db.bulk_insert_activities(accepted)

            print("🔍 Rebuilding search index...")
            self.db.rebuild_fts_index()

            elapsed = time.time() - started_at

            # A statistics failure must not turn a successful insert into an
            # error, so it is reported inside an otherwise successful result.
            try:
                stats = self._generate_indexing_stats(accepted, elapsed)
                stats['inserted_count'] = inserted_count
                stats['success'] = True
            except Exception as e:
                print(f"⚠️ Error generating statistics: {e}")
                stats = {
                    'success': True,
                    'inserted_count': inserted_count,
                    'indexing_time_seconds': elapsed,
                    'error': f'Stats generation failed: {str(e)}'
                }

            print(f"✅ Indexing complete! {inserted_count} activities indexed in {elapsed:.2f}s")

            # Best-effort read-back of the database state.
            try:
                db_stats = self.db.get_statistics()
                print(f"📊 Database now contains {db_stats['total_activities']} activities")
            except Exception as e:
                print(f"⚠️ Error getting database statistics: {e}")
                print("📊 Database insertion completed, statistics unavailable")

            return stats
        except Exception as e:
            print(f"❌ Error during database insertion: {e}")
            return {'success': False, 'error': str(e)}

    def index_specific_category(self, category_code: str) -> Dict[str, Any]:
        """Parse and store the activities of one category only.

        Args:
            category_code: a key of the parser's ``category_mapping``.

        Returns:
            A dict with ``'success'`` and either ``'error'`` or the category
            name and the parsed / valid / inserted counts.
        """
        print(f"🎯 Indexing specific category: {category_code}")

        if not self.parser.load_content():
            return {'success': False, 'error': 'Could not load INDEX_MASTER'}

        category_name = self.parser.category_mapping.get(category_code)
        if not category_name:
            return {'success': False, 'error': f'Unknown category code: {category_code}'}

        parsed = self.parser.parse_category_section(category_code, category_name)
        if not parsed:
            return {'success': False, 'error': f'No activities found in category {category_code}'}

        accepted = []
        for candidate in parsed:
            if self.parser.validate_activity_completeness(candidate):
                accepted.append(candidate)

        try:
            inserted_count = self.db.bulk_insert_activities(accepted)
        except Exception as e:
            return {'success': False, 'error': str(e)}
        return {
            'success': True,
            'category': category_name,
            'inserted_count': inserted_count,
            'total_parsed': len(parsed),
            'valid_activities': len(accepted)
        }

    def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]:
        """Summarise an indexing run.

        Combines the parser's own statistics with per-category, age-range,
        duration and materials tallies of ``activities``. A display getter
        that raises is reported and counted under ``"nespecificat"``.
        """
        parser_stats = self.parser.get_parsing_statistics()

        categories = {}
        age_ranges = {}
        durations = {}
        materials = {}

        for activity in activities:
            # Category breakdown
            categories[activity.category] = categories.get(activity.category, 0) + 1

            # Age range tally (guarded)
            try:
                age_key = activity.get_age_range_display() or "nespecificat"
                age_ranges[age_key] = age_ranges.get(age_key, 0) + 1
            except Exception as e:
                print(f"Warning: Error getting age range for activity {activity.name}: {e}")
                age_ranges["nespecificat"] = age_ranges.get("nespecificat", 0) + 1

            # Duration tally (guarded)
            try:
                duration_key = activity.get_duration_display() or "nespecificat"
                durations[duration_key] = durations.get(duration_key, 0) + 1
            except Exception as e:
                print(f"Warning: Error getting duration for activity {activity.name}: {e}")
                durations["nespecificat"] = durations.get("nespecificat", 0) + 1

            # Materials tally (guarded)
            try:
                materials_key = activity.get_materials_display() or "nespecificat"
                materials[materials_key] = materials.get(materials_key, 0) + 1
            except Exception as e:
                print(f"Warning: Error getting materials for activity {activity.name}: {e}")
                materials["nespecificat"] = materials.get("nespecificat", 0) + 1

        # Activities carrying at least one structured field.
        with_metadata = [
            a for a in activities
            if a.age_group_min or a.participants_min or a.duration_min
        ]

        distribution = {
            'categories': categories,
            'age_ranges': age_ranges,
            'durations': durations,
            'materials': materials
        }
        quality_metrics = {
            'completion_rate': parser_stats.get('completion_rate', 0),
            'average_description_length': parser_stats.get('average_description_length', 0),
            'activities_with_metadata': len(with_metadata)
        }
        return {
            'indexing_time_seconds': indexing_time,
            'parsing_stats': parser_stats,
            'distribution': distribution,
            'quality_metrics': quality_metrics
        }

    def verify_indexing_quality(self) -> Dict[str, Any]:
        """Spot-check what is stored in the database.

        Compares the activity total with the 500-activity target and samples
        up to 10 activities, flagging descriptions shorter than 10 characters
        and missing categories. Each issue costs 10 points of the 0-100
        ``quality_score``. Any exception yields
        ``{'error': ..., 'quality_score': 0}``.
        """
        try:
            db_stats = self.db.get_statistics()
            total_activities = db_stats['total_activities']
            categories = db_stats.get('categories', {})

            quality_issues = []
            for sample in self.db.search_activities(limit=10):
                if not sample.get('description') or len(sample['description']) < 10:
                    quality_issues.append(f"Activity {sample.get('name', 'Unknown')} has insufficient description")
                if not sample.get('category'):
                    quality_issues.append(f"Activity {sample.get('name', 'Unknown')} missing category")

            return {
                'total_activities': total_activities,
                'meets_minimum_requirement': total_activities >= 500,
                'minimum_target': 500,
                'category_coverage': len(categories),
                'expected_categories': len(self.parser.category_mapping),
                'quality_issues': quality_issues,
                'quality_score': max(0, 100 - len(quality_issues) * 10),
                'database_stats': db_stats
            }
        except Exception as e:
            return {'error': str(e), 'quality_score': 0}

    def get_indexing_progress(self) -> Dict[str, Any]:
        """Report progress towards the 500-activity target.

        Returns the current count, the target, a percentage capped at 100, a
        ``'completed'`` / ``'in_progress'`` status, the indexed category
        names and the database size in MB. Any exception yields
        ``{'error': ..., 'status': 'error'}``.
        """
        try:
            db_stats = self.db.get_statistics()
            total_activities = db_stats['total_activities']
            target_activities = 500

            if total_activities >= target_activities:
                status = 'completed'
            else:
                status = 'in_progress'

            return {
                'current_activities': total_activities,
                'target_activities': target_activities,
                'progress_percentage': min(100, (total_activities / target_activities) * 100),
                'status': status,
                'categories_indexed': list(db_stats.get('categories', {}).keys()),
                'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024)
            }
        except Exception as e:
            return {'error': str(e), 'status': 'error'}

View File

@@ -1,340 +0,0 @@
"""
Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md
Extracts 500+ individual activities with full details
"""
import re
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from app.models.activity import Activity
class IndexMasterParser:
"""Advanced parser for extracting real activities from INDEX_MASTER"""
def __init__(self, index_file_path: str):
"""Initialize parser with INDEX_MASTER file path"""
self.index_file_path = Path(index_file_path)
self.content = ""
self.activities = []
# Category mapping for main sections (exact match from file)
self.category_mapping = {
'[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT',
'[B]': 'TEAM BUILDING ȘI COMUNICARE',
'[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR',
'[D]': 'ESCAPE ROOM ȘI PUZZLE-URI',
'[E]': 'ORIENTARE ȘI BUSOLE',
'[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA',
'[G]': 'ACTIVITĂȚI EDUCAȚIONALE',
'[H]': 'RESURSE SPECIALE'
}
def load_content(self) -> bool:
    """Read the INDEX_MASTER file into ``self.content``.

    Returns:
        True when the file exists and holds at least 1000 characters.
        False, after printing a diagnostic, when the file is missing, is
        shorter than that, or cannot be read. A too-short file is still
        stored in ``self.content``.
    """
    source_path = self.index_file_path
    try:
        if not source_path.exists():
            print(f"❌ INDEX_MASTER file not found: {self.index_file_path}")
            return False

        with open(source_path, 'r', encoding='utf-8') as handle:
            self.content = handle.read()

        # Sanity check: a real index is far longer than this.
        is_plausible = len(self.content) >= 1000
        if not is_plausible:
            print(f"⚠️ INDEX_MASTER file seems too small: {len(self.content)} chars")
            return False

        print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters")
        return True
    except Exception as e:
        print(f"❌ Error loading INDEX_MASTER: {e}")
        return False
def parse_all_categories(self) -> List[Activity]:
    """Parse every category in ``category_mapping`` into activities.

    Returns:
        The activities of all categories, in mapping order. The same list is
        kept in ``self.activities``. An empty list is returned when the
        INDEX_MASTER content cannot be loaded, in which case
        ``self.activities`` is left untouched.
    """
    if not self.load_content():
        return []

    # Start every run from an empty list. Without this a second call would
    # append to the results of the first and return each activity twice.
    # The attribute is rebound rather than cleared so a list handed to an
    # earlier caller is not emptied behind its back.
    self.activities = []

    print("🔍 Starting comprehensive parsing of INDEX_MASTER...")
    # Parse each main category
    for category_code, category_name in self.category_mapping.items():
        print(f"\n📂 Processing category {category_code}: {category_name}")
        category_activities = self.parse_category_section(category_code, category_name)
        self.activities.extend(category_activities)
        print(f" ✅ Extracted {len(category_activities)} activities")
    print(f"\n🎯 Total activities extracted: {len(self.activities)}")
    return self.activities
def parse_category_section(self, category_code: str, category_name: str) -> List[Activity]:
    """Extract the activities of one ``## <code> <name>`` section.

    The section header is matched case-insensitively on a line of its own.
    When it occurs more than once, the last occurrence is used (earlier ones
    are expected to be table-of-contents entries). The section runs until
    the next ``## [A-H] ...`` header or the end of the content.

    Returns:
        The activities found in the section, or an empty list when the
        header is not present.
    """
    # Exact header line for this category.
    pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$"
    headers = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE))
    if not headers:
        print(f" ⚠️ Category section not found: {category_code}")
        return []

    # The last hit is the real section, not the table of contents.
    header = headers[-1]
    print(f" 📍 Found section at position {header.start()}")

    # Everything after the header, cut at the next main-category header.
    remainder = self.content[header.end():]
    next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]"
    following = re.search(next_category_pattern, remainder, re.MULTILINE)
    if following:
        section_content = remainder[:following.start()]
    else:
        section_content = remainder

    return list(self._parse_subsections(section_content, category_name))
def _parse_subsections(self, section_content: str, category_name: str) -> List[Activity]:
    """Split a category section on its ``###`` headings and parse each part.

    Each subsection spans from the end of its heading line to the start of
    the next heading, or to the end of the section for the last one. Text
    before the first heading is ignored.
    """
    headings = list(re.finditer(r"^### (.+?)$", section_content, re.MULTILINE))
    # End offset of every subsection: the start of the following heading,
    # and the end of the section for the final one.
    boundaries = [heading.start() for heading in headings[1:]]
    boundaries.append(len(section_content))

    activities = []
    for heading, boundary in zip(headings, boundaries):
        title = heading.group(1).strip()
        body = section_content[heading.end():boundary]
        activities.extend(
            self._parse_games_in_subsection(body, category_name, title)
        )
    return activities
def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
    """Build one Activity per game listed in a subsection.

    Games are read from the numbered ``N. **Name** - description`` lines that
    follow a ``**Exemple de jocuri:**`` marker. Every game of the subsection
    receives the same subsection-level metadata. Activities found by
    ``_parse_direct_activities`` are appended after them.

    Args:
        subsection_text: the text between two ``###`` headings.
        category_name: stored as the activity's ``category``.
        subsection_title: stored as the activity's ``subcategory``.
    """
    activities = []
    # The metadata depends only on the subsection text, so it is extracted
    # once here instead of once per game inside the loops below.
    metadata = self._extract_subsection_metadata(subsection_text)

    # Look for "Exemple de jocuri:" sections
    examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)"
    # Individual games (numbered list)
    game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$"
    for examples_match in re.finditer(examples_pattern, subsection_text, re.DOTALL):
        examples_text = examples_match.group(1)
        for game_match in re.finditer(game_pattern, examples_text, re.MULTILINE):
            game_number = game_match.group(1)
            game_name = game_match.group(2).strip()
            game_description = game_match.group(3).strip()
            activities.append(
                Activity(
                    name=game_name,
                    description=game_description,
                    category=category_name,
                    subcategory=subsection_title,
                    source_file="INDEX_MASTER_JOCURI_ACTIVITATI.md",
                    page_reference=f"{category_name} > {subsection_title} > #{game_number}",
                    **metadata
                )
            )

    # Also extract from direct activity descriptions without "Exemple de jocuri"
    activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title))
    return activities
def _extract_subsection_metadata(self, subsection_text: str) -> Dict:
"""Extract metadata from subsection text"""
metadata = {}
# Extract participants info
participants_pattern = r"\*\*Participanți:\*\*\s*(.+?)(?:\n|\*\*)"
participants_match = re.search(participants_pattern, subsection_text)
if participants_match:
participants_text = participants_match.group(1).strip()
participants = self._parse_participants(participants_text)
metadata.update(participants)
# Extract duration
duration_pattern = r"\*\*Durata:\*\*\s*(.+?)(?:\n|\*\*)"
duration_match = re.search(duration_pattern, subsection_text)
if duration_match:
duration_text = duration_match.group(1).strip()
duration = self._parse_duration(duration_text)
metadata.update(duration)
# Extract materials
materials_pattern = r"\*\*Materiale:\*\*\s*(.+?)(?:\n|\*\*)"
materials_match = re.search(materials_pattern, subsection_text)
if materials_match:
materials_text = materials_match.group(1).strip()
metadata['materials_list'] = materials_text
metadata['materials_category'] = self._categorize_materials(materials_text)
# Extract keywords
keywords_pattern = r"\*\*Cuvinte cheie:\*\*\s*(.+?)(?:\n|\*\*)"
keywords_match = re.search(keywords_pattern, subsection_text)
if keywords_match:
metadata['keywords'] = keywords_match.group(1).strip()
return metadata
def _parse_participants(self, participants_text: str) -> Dict:
"""Parse participants information"""
result = {}
# Look for number ranges like "8-30 copii" or "5-15 persoane"
range_pattern = r"(\d+)-(\d+)"
range_match = re.search(range_pattern, participants_text)
if range_match:
result['participants_min'] = int(range_match.group(1))
result['participants_max'] = int(range_match.group(2))
else:
# Look for single numbers
number_pattern = r"(\d+)\+"
number_match = re.search(number_pattern, participants_text)
if number_match:
result['participants_min'] = int(number_match.group(1))
# Extract age information
age_pattern = r"(\d+)-(\d+)\s*ani"
age_match = re.search(age_pattern, participants_text)
if age_match:
result['age_group_min'] = int(age_match.group(1))
result['age_group_max'] = int(age_match.group(2))
return result
def _parse_duration(self, duration_text: str) -> Dict:
"""Parse duration information"""
result = {}
# Look for time ranges like "5-20 minute" or "15-30min"
range_pattern = r"(\d+)-(\d+)\s*(?:minute|min)"
range_match = re.search(range_pattern, duration_text)
if range_match:
result['duration_min'] = int(range_match.group(1))
result['duration_max'] = int(range_match.group(2))
else:
# Look for single duration
single_pattern = r"(\d+)\+?\s*(?:minute|min)"
single_match = re.search(single_pattern, duration_text)
if single_match:
result['duration_min'] = int(single_match.group(1))
return result
def _categorize_materials(self, materials_text: str) -> str:
"""Categorize materials into simple categories"""
materials_lower = materials_text.lower()
if any(word in materials_lower for word in ['fără', 'nu necesare', 'nimic', 'minime']):
return 'Fără materiale'
elif any(word in materials_lower for word in ['hârtie', 'creion', 'marker', 'simple']):
return 'Materiale simple'
elif any(word in materials_lower for word in ['computer', 'proiector', 'echipament', 'complexe']):
return 'Materiale complexe'
else:
return 'Materiale variate'
def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List[Activity]:
    """Build placeholder activities for subsections that only reference files.

    Subsections containing an '**Exemple de jocuri:**' list are skipped and
    yield an empty list. Otherwise, every ``**Fișier:** `name``` reference
    followed by a bold phrase produces one generic "collection" activity for
    that file, carrying the subsection-level metadata.
    """
    # Subsections with an explicit examples list are not processed here.
    if "**Exemple de jocuri:**" in subsection_text:
        return []

    file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*"
    collected = []
    for hit in re.finditer(file_pattern, subsection_text, re.DOTALL):
        file_name, description_part = hit.group(1), hit.group(2)
        # One general activity per referenced file
        collected.append(
            Activity(
                name=f"Activități din {file_name}",
                description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...",
                category=category_name,
                subcategory=subsection_title,
                source_file=file_name,
                page_reference=f"{category_name} > {subsection_title}",
                **self._extract_subsection_metadata(subsection_text)
            )
        )
    return collected
def validate_activity_completeness(self, activity: Activity) -> bool:
    """Return True when the activity has every required field filled in.

    ``name``, ``description``, ``category`` and ``source_file`` must all be
    non-empty after stripping whitespace, and the description must be at
    least 10 characters long.
    """
    for attr in ('name', 'description', 'category', 'source_file'):
        value = getattr(activity, attr)
        if not (value and value.strip()):
            return False

    # Minimum description length
    return len(activity.description) >= 10
def get_parsing_statistics(self) -> Dict:
    """Summarise the parsed activities held in ``self.activities``.

    Returns ``{'total_activities': 0}`` when nothing was parsed. Otherwise
    returns the total, the number of activities passing
    ``validate_activity_completeness``, that share as a percentage, the
    per-category counts and the mean description length.
    """
    parsed = self.activities
    if not parsed:
        return {'total_activities': 0}

    total = len(parsed)
    per_category = {}
    complete = 0
    for item in parsed:
        per_category[item.category] = per_category.get(item.category, 0) + 1
        if self.validate_activity_completeness(item):
            complete += 1

    description_chars = sum(len(item.description) for item in parsed)
    return {
        'total_activities': total,
        'valid_activities': complete,
        'completion_rate': (complete / total) * 100,
        'category_breakdown': per_category,
        'average_description_length': description_chars / total,
    }

View File

@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering
from typing import List, Dict, Any, Optional
from app.models.database import DatabaseManager
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
import re
# Category slugs that are themselves "non-game" — selecting one of these as a
# category filter also lifts the default non-game content_type exclusion.
NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
# When a Python-side post-filter is active the DB LIMIT is applied *before*
# filtering, so we over-fetch to still satisfy the caller's `limit`.
_OVERSCAN_FACTOR = 5
_OVERSCAN_CAP = 2000
class SearchService:
"""Enhanced search service with intelligent query processing"""
@@ -31,15 +42,65 @@ class SearchService:
# Map web filters to database fields
db_filters = self._map_filters_to_db_fields(filters)
# content_type and language are filtered in Python: the DB layer does
# not expose them as query parameters. The DEFAULT search excludes the
# non-game content types (rețete / cântece / ceremonii) — they surface
# only when the user explicitly filters that content_type, or picks a
# non-game category. See plan §6.
content_type, exclude_non_game = self._resolve_content_type_filter(filters)
language = (filters.get('language') or '').strip().lower() or None
post_filtering = bool(content_type or exclude_non_game or language)
# Over-fetch when post-filtering so the final list can still reach `limit`.
fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit
# Perform database search
results = self.db.search_activities(
search_text=processed_search,
**db_filters,
limit=limit
limit=fetch_limit
)
# Post-process results for relevance and ranking
return self._post_process_results(results, processed_search, filters)
# Apply content_type / language post-filters
results = self._apply_content_type_filter(results, content_type, exclude_non_game)
if language:
results = [r for r in results
if (r.get('language') or '').strip().lower() == language]
# Post-process results for relevance and ranking, then honour `limit`
results = self._post_process_results(results, processed_search, filters)
return results[:limit]
def _resolve_content_type_filter(self, filters: Dict[str, str]):
"""Determine the content_type post-filter.
Returns (explicit_content_type | None, exclude_non_game: bool):
- an explicit `content_type` filter → that value, no exclusion;
- a `category` filter on a non-game category → no exclusion;
- otherwise → default search, exclude non-game content types.
"""
content_type = (filters.get('content_type') or '').strip()
if content_type:
return content_type, False
category = (filters.get('category') or '').strip()
if category in NON_GAME_CATEGORIES:
return None, False
return None, True
def _apply_content_type_filter(self,
results: List[Dict[str, Any]],
content_type: Optional[str],
exclude_non_game: bool) -> List[Dict[str, Any]]:
"""Filter results by content_type (explicit include vs default exclude)."""
if content_type:
return [r for r in results
if (r.get('content_type') or '') == content_type]
if exclude_non_game:
# Rows with NULL/unknown content_type are kept — only the known
# non-game types are dropped from the default search.
return [r for r in results
if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES]
return results
def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
"""Process and enhance search text for better FTS5 results"""
@@ -83,6 +144,12 @@ class SearchService:
if not filter_value or not filter_value.strip():
continue
# content_type / language are NOT database query params — they are
# applied as Python post-filters in search_activities(). Skip them
# here so they never reach DatabaseManager.search_activities().
if filter_key in ('content_type', 'language'):
continue
# Map filter types to database fields
if filter_key == 'category':
db_filters['category'] = filter_value
@@ -177,7 +244,8 @@ class SearchService:
boost_score = 0
# Check name matches (highest priority)
name_lower = result.get('name', '').lower()
# NB: use `or ''` — nullable columns come back as None, not ''.
name_lower = (result.get('name') or '').lower()
for term in search_terms:
if term in name_lower:
boost_score += 10
@@ -185,13 +253,13 @@ class SearchService:
boost_score += 5 # Extra boost for name starts with term
# Check description matches
desc_lower = result.get('description', '').lower()
desc_lower = (result.get('description') or '').lower()
for term in search_terms:
if term in desc_lower:
boost_score += 3
# Check keywords matches
keywords_lower = result.get('keywords', '').lower()
keywords_lower = (result.get('keywords') or '').lower()
for term in search_terms:
if term in keywords_lower:
boost_score += 5
@@ -280,11 +348,14 @@ class SearchService:
return []
try:
# Search for activities that match the partial query
# Search for activities that match the partial query.
# Over-fetch then drop non-game content types so autocomplete
# mirrors the default search (no rețete / cântece / ceremonii).
results = self.db.search_activities(
search_text=f'"{partial_query}"',
limit=limit * 2
limit=limit * 6
)
results = self._apply_content_type_filter(results, None, True)
suggestions = []
seen = set()

View File

@@ -15,7 +15,13 @@
<header class="activity-detail-header">
<div class="activity-title-section">
<h1 class="activity-detail-title">{{ activity.name }}</h1>
<span class="activity-category-badge">{{ activity.category }}</span>
<span class="activity-category-badge">{{ display_names.get(activity.category, activity.category) }}</span>
{% if activity.content_type %}
<span class="activity-content-type-badge">{{ display_names.get(activity.content_type, activity.content_type) }}</span>
{% endif %}
{% if activity.needs_review %}
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
{% endif %}
</div>
{% if activity.subcategory %}

View File

@@ -36,7 +36,31 @@
<select name="category" id="category" class="filter-select">
<option value="">Toate categoriile</option>
{% for category in filters.category %}
<option value="{{ category }}">{{ category }}</option>
<option value="{{ category }}">{{ display_names.get(category, category) }}</option>
{% endfor %}
</select>
</div>
{% endif %}
{% if filters.content_type %}
<div class="filter-group">
<label for="content_type" class="filter-label">Tip conținut</label>
<select name="content_type" id="content_type" class="filter-select">
<option value="">Doar jocuri și activități</option>
{% for content_type in filters.content_type %}
<option value="{{ content_type }}">{{ display_names.get(content_type, content_type) }}</option>
{% endfor %}
</select>
</div>
{% endif %}
{% if filters.language %}
<div class="filter-group">
<label for="language" class="filter-label">Limbă</label>
<select name="language" id="language" class="filter-select">
<option value="">Toate limbile</option>
{% for language in filters.language %}
<option value="{{ language }}">{{ display_names.get(language, language) }}</option>
{% endfor %}
</select>
</div>

View File

@@ -24,7 +24,29 @@
<option value="">Toate categoriile</option>
{% for category in filters.category %}
<option value="{{ category }}" {% if applied_filters.category == category %}selected{% endif %}>
{{ category }}
{{ display_names.get(category, category) }}
</option>
{% endfor %}
</select>
{% endif %}
{% if filters.content_type %}
<select name="content_type" class="filter-select compact">
<option value="">Doar jocuri și activități</option>
{% for content_type in filters.content_type %}
<option value="{{ content_type }}" {% if applied_filters.content_type == content_type %}selected{% endif %}>
{{ display_names.get(content_type, content_type) }}
</option>
{% endfor %}
</select>
{% endif %}
{% if filters.language %}
<select name="language" class="filter-select compact">
<option value="">Toate limbile</option>
{% for language in filters.language %}
<option value="{{ language }}" {% if applied_filters.language == language %}selected{% endif %}>
{{ display_names.get(language, language) }}
</option>
{% endfor %}
</select>
@@ -109,7 +131,10 @@
{{ activity.name }}
</a>
</h3>
<span class="activity-category">{{ activity.category }}</span>
<span class="activity-category">{{ display_names.get(activity.category, activity.category) }}</span>
{% if activity.needs_review %}
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
{% endif %}
</header>
<div class="activity-content">

View File

@@ -7,11 +7,17 @@ from flask import Blueprint, request, render_template, jsonify, current_app
from app.models.database import DatabaseManager
from app.models.activity import Activity
from app.services.search import SearchService
from app.config_taxonomy import CATEGORIES, CONTENT_TYPES
import os
from pathlib import Path
bp = Blueprint('main', __name__)
# Slug -> Romanian display name. Category slugs, content_type slugs and
# language codes never collide, so a single flat map is enough for the UI labels.
LANGUAGE_NAMES = {'ro': 'Română', 'en': 'Engleză'}
DISPLAY_NAMES = {**CATEGORIES, **CONTENT_TYPES, **LANGUAGE_NAMES}
# Initialize database manager (will be configured in application factory)
def get_db_manager():
"""Get database manager instance"""
@@ -38,6 +44,7 @@ def index():
return render_template('index.html',
filters=filter_options,
display_names=DISPLAY_NAMES,
stats=stats)
except Exception as e:
@@ -45,6 +52,7 @@ def index():
# Fallback with empty filters
return render_template('index.html',
filters={},
display_names=DISPLAY_NAMES,
stats={'total_activities': 0})
@bp.route('/search', methods=['GET', 'POST'])
@@ -82,6 +90,7 @@ def search():
search_query=search_query,
applied_filters=filters,
filters=filter_options,
display_names=DISPLAY_NAMES,
results_count=len(activities))
except Exception as e:
@@ -91,6 +100,7 @@ def search():
search_query='',
applied_filters={},
filters={},
display_names=DISPLAY_NAMES,
results_count=0,
error=str(e))
@@ -121,6 +131,7 @@ def activity_detail(activity_id):
return render_template('activity.html',
activity=activity,
display_names=DISPLAY_NAMES,
similar_activities=similar_activities)
except Exception as e:

View File

@@ -0,0 +1,81 @@
# SUBAGENT — Activity extraction
You are a subagent in the game-library extraction pipeline. You extract
educational activities (games, team-building, scouting, recipes, songs,
ceremonies) from one chunk of a source document into structured JSON.
## Your task
1. **Read ONLY the chunk you were assigned.** Do not read other chunks, other
source files, or the original document; the only other file you may consult
is the schema named in step 3. The chunk is a `.txt` file with
`--- PAGE N ---` markers.
2. Identify **every distinct activity** in the chunk.
3. For each activity, fill the schema in `scripts/activity_schema.json`.
4. Write the result to `data/extracted/<chunk_key>.json`.
## What counts as "a distinct activity"
A distinct activity is a self-contained game/activity/recipe/song/ceremony with
its own name and a real description of how to do it. It is NOT:
- a bare mention or a cross-reference with no description — **skip it**;
- a sub-variant of an activity already extracted — fold it into `variations`;
- a heading, a table of contents entry, or running page chrome.
If the same activity is split across a page boundary inside your chunk, treat it
as **one** activity and combine the text.
## Output format
The file is one JSON object: a `header` plus an `activities` array.
```json
{
"header": {
"source_id": "<set from the prompt>",
"chunk_key": "<set from the prompt>",
"source_hash": "<set from the prompt>",
"schema_version": "1.0",
"prompt_version": "1.0",
"chunk_range": "pages 1-20"
},
"activities": [ ... ]
}
```
## Rules for each activity
- **`name`** — the activity's real name (≥3 characters).
- **`description`** — real prose describing the activity. No hard length limit,
but it must actually describe what happens.
- **`rules`** — how it is played / carried out, if the source gives rules.
- **`category`** — exactly one taxonomy slug (see the `enum` in the schema):
`jocuri-cercetasesti`, `team-building`, `icebreakers`, `camp-outdoor`,
`wide-games`, `orientare`, `prim-ajutor`, `escape-room-puzzle`,
`creative-stem`, `sports-active`, `cantece-ceremonii`, `retete`,
`supravietuire`, `integrare-incluziune`, `conflict-empatie`, `altele`.
When unsure, use `altele`.
- **`content_type`** — the FORM of the content, independent of category:
`joc`, `activitate`, `reteta`, `cantec`, or `ceremonie`.
- **`language`** — `ro` or `en` (the language the activity is written in).
- **`source_excerpt`** — **MANDATORY.** A short quote (one or two sentences)
copied **verbatim** from the chunk. This is the anti-hallucination anchor: it
is checked as a fuzzy substring of the chunk, and invented quotes are
rejected.
- **`page_reference`** — **MANDATORY.** The `--- PAGE N ---` marker(s) the
activity came from, e.g. `"page 14"` or `"pages 14-15"`.
- **`extraction_confidence`** — `high`, `med`, or `low`. Use `low` when the
source text for the activity is thin or ambiguous.
## Never invent data
- Do **not** invent ages, participant counts, or durations. If the source does
not state them, leave those fields `null`.
- Do **not** paraphrase the `source_excerpt` — copy it character for character.
- Better to extract fewer activities accurately than to pad the output.
## Before you finish
- Every activity has a non-empty `source_excerpt` and `page_reference`.
- The file validates against `scripts/activity_schema.json`.
- You only used text from your assigned chunk.

View File

@@ -0,0 +1,110 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Game-library extraction output",
"description": "One subagent output file: a header carrying provenance/version metadata plus the list of activities extracted from a single chunk.",
"type": "object",
"required": ["header", "activities"],
"additionalProperties": false,
"properties": {
"header": {
"type": "object",
"required": ["source_hash", "schema_version", "prompt_version", "chunk_range"],
"additionalProperties": true,
"properties": {
"source_hash": {"type": "string", "minLength": 8},
"schema_version": {"type": "string"},
"prompt_version": {"type": "string"},
"chunk_range": {"type": "string"},
"source_id": {"type": ["string", "null"]},
"chunk_key": {"type": ["string", "null"]}
}
},
"activities": {
"type": "array",
"items": {"$ref": "#/definitions/activity"}
}
},
"definitions": {
"activity": {
"type": "object",
"required": [
"name",
"description",
"category",
"content_type",
"language",
"extraction_confidence",
"source_excerpt",
"page_reference"
],
"additionalProperties": false,
"properties": {
"name": {"type": "string", "minLength": 3},
"description": {"type": "string", "minLength": 1},
"rules": {"type": ["string", "null"]},
"variations": {"type": ["string", "null"]},
"category": {
"type": "string",
"enum": [
"jocuri-cercetasesti",
"team-building",
"icebreakers",
"camp-outdoor",
"wide-games",
"orientare",
"prim-ajutor",
"escape-room-puzzle",
"creative-stem",
"sports-active",
"cantece-ceremonii",
"retete",
"supravietuire",
"integrare-incluziune",
"conflict-empatie",
"altele"
]
},
"subcategory": {"type": ["string", "null"]},
"content_type": {
"type": "string",
"enum": ["joc", "activitate", "reteta", "cantec", "ceremonie"]
},
"language": {"type": "string", "enum": ["ro", "en"]},
"extraction_confidence": {
"type": "string",
"enum": ["high", "med", "low"]
},
"source_excerpt": {"type": "string", "minLength": 1},
"page_reference": {"type": "string", "minLength": 1},
"source_file": {"type": ["string", "null"]},
"age_group_min": {"type": ["integer", "null"], "minimum": 0},
"age_group_max": {"type": ["integer", "null"], "minimum": 0},
"participants_min": {"type": ["integer", "null"], "minimum": 0},
"participants_max": {"type": ["integer", "null"], "minimum": 0},
"duration_min": {"type": ["integer", "null"], "minimum": 0},
"duration_max": {"type": ["integer", "null"], "minimum": 0},
"materials_category": {"type": ["string", "null"]},
"materials_list": {
"type": ["array", "null"],
"items": {"type": "string"}
},
"skills_developed": {
"type": ["array", "null"],
"items": {"type": "string"}
},
"difficulty_level": {
"type": ["string", "null"],
"enum": ["usor", "mediu", "dificil", null]
},
"keywords": {
"type": ["array", "null"],
"items": {"type": "string"}
},
"tags": {
"type": ["array", "null"],
"items": {"type": "string"}
}
}
}
}
}

639
scripts/build_database.py Normal file
View File

@@ -0,0 +1,639 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
build_database.py — build data/activities.db from the subagent extraction JSON.
Replaces the old import_claude_activities.py. Pipeline (plan §4):
1. `--rebuild` builds into data/activities.db.tmp; on success the live DB is
backed up to data/activities.db.bak and the tmp file is swapped in with an
atomic os.replace. A mid-build crash leaves the live DB untouched.
2. Every data/extracted/*.json is validated against scripts/activity_schema.json;
invalid files are moved to data/extracted/_rejected/ with an error log.
2b. Each source_excerpt must appear as a fuzzy substring (rapidfuzz
partial_ratio >= 90) of its source chunk — non-matches are hallucinations
and the activity is dropped (logged to _rejected/).
3. `category` is normalized to a valid taxonomy slug (fallback `altele`).
4. Dedup (D5): group by exact normalized_name, never across languages; within a
group rapidfuzz on descriptions — >=85 auto-merge, 60-85 borderline (keep
both, needs_review), <60 separate variants.
5. data/review_decisions.json is applied before insert.
6. Bulk insert into the tmp DB, populate the categories table, rebuild FTS.
7. A QA report is printed.
Usage:
python scripts/build_database.py --rebuild
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Optional
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
if _p not in sys.path:
sys.path.insert(0, _p)
from app.config_taxonomy import ( # noqa: E402
category_display_name,
normalize_category,
normalize_content_type,
)
from app.models.activity import Activity # noqa: E402
from app.models.database import DatabaseManager # noqa: E402
from import_common import ( # noqa: E402
DEFAULT_SCHEMA_PATH,
content_key,
excerpt_matches,
find_chunk_text,
iter_extraction_files,
load_schema,
normalize_name,
source_path_for,
)
# dedup thresholds (rapidfuzz token_sort_ratio, 0..100 scale)
AUTO_MERGE_THRESHOLD = 85.0
BORDERLINE_THRESHOLD = 60.0
# --------------------------------------------------------------------------
# extraction dict -> Activity
# --------------------------------------------------------------------------
def _csv(value: Any) -> Optional[str]:
"""Schema arrays -> comma string for the (TEXT) DB columns."""
if value is None:
return None
if isinstance(value, str):
return value.strip() or None
if isinstance(value, (list, tuple)):
parts = [str(v).strip() for v in value if str(v).strip()]
return ", ".join(parts) or None
return str(value)
def _split_csv(value: Optional[str]) -> list[str]:
if not value:
return []
return [p.strip() for p in str(value).split(",") if p.strip()]
def dict_to_activity(adict: dict, source_file: str) -> Activity:
    """Convert one activity object from the extraction JSON into an Activity.

    `tags` and `source_files` may arrive as a list or a comma string and are
    turned into lists; `source_file` is put first in `source_files` when it
    is not already listed. Category and content_type go through the taxonomy
    normalizers, array-valued text fields are flattened with `_csv`, and the
    remaining fields are copied as-is (missing keys become ``None``).
    """
    def as_list(raw):
        # Missing/empty -> []; comma string -> split; anything else unchanged.
        raw = raw or []
        return _split_csv(raw) if isinstance(raw, str) else raw

    tags = as_list(adict.get("tags"))
    source_files = as_list(adict.get("source_files"))
    if source_file and source_file not in source_files:
        source_files = [source_file, *source_files]

    # Fields taken over verbatim from the extraction dict.
    verbatim = {
        field: adict.get(field)
        for field in (
            "rules",
            "variations",
            "subcategory",
            "page_reference",
            "source_excerpt",
            "age_group_min",
            "age_group_max",
            "participants_min",
            "participants_max",
            "duration_min",
            "duration_max",
            "materials_category",
            "difficulty_level",
            "language",
            "extraction_confidence",
        )
    }

    return Activity(
        name=(adict.get("name") or "").strip(),
        description=(adict.get("description") or "").strip(),
        category=normalize_category(adict.get("category", "")),
        content_type=normalize_content_type(adict.get("content_type", "")),
        source_file=source_file,
        source_files=list(source_files),
        materials_list=_csv(adict.get("materials_list")),
        skills_developed=_csv(adict.get("skills_developed")),
        keywords=_csv(adict.get("keywords")),
        tags=list(tags),
        **verbatim,
    )
# --------------------------------------------------------------------------
# step 3 — category normalization is done in dict_to_activity; a non-taxonomy
# value silently falls back to `altele`. This logs the substitutions.
# --------------------------------------------------------------------------
def log_category_fallbacks(raw_pairs: list[tuple[str, str]]) -> list[str]:
    """Describe every category that was replaced by the `altele` fallback.

    `raw_pairs` holds ``(original, slug)`` tuples. A message is produced when
    the slug is `altele` but the normalized original value was neither empty
    nor `altele` itself.
    """
    return [
        f"category '{original}' -> altele (not in taxonomy)"
        for original, slug in raw_pairs
        if slug == "altele"
        and normalize_name(original or "") not in ("", "altele")
    ]
# --------------------------------------------------------------------------
# step 4 — dedup
# --------------------------------------------------------------------------
def _longest(*values: Optional[str]) -> Optional[str]:
best: Optional[str] = None
for v in values:
if v and (best is None or len(v) > len(best)):
best = v
return best
def _union_csv(values: list[Optional[str]]) -> Optional[str]:
seen: list[str] = []
for value in values:
for item in _split_csv(value):
if item not in seen:
seen.append(item)
return ", ".join(seen) or None
def merge_cluster(cluster: list[Activity]) -> Activity:
    """Collapse a cluster of duplicate activities into a single Activity.

    A one-element cluster is returned untouched. Otherwise the member with
    the longest description is the representative and supplies the scalar
    fields; description/rules/variations take the longest text found in the
    cluster; materials, skills, keywords, tags and sources are unioned; and
    popularity_score is the cluster maximum plus one per merged duplicate.
    """
    if len(cluster) == 1:
        return cluster[0]

    def unique_in_order(items):
        # Order-preserving de-duplication.
        kept = []
        for item in items:
            if item not in kept:
                kept.append(item)
        return kept

    # Representative = the member with the longest description.
    rep = max(cluster, key=lambda a: len(a.description or ""))

    # Scalar fields copied straight from the representative.
    inherited = {
        field: getattr(rep, field)
        for field in (
            "name",
            "category",
            "subcategory",
            "content_type",
            "source_file",
            "page_reference",
            "source_excerpt",
            "age_group_min",
            "age_group_max",
            "participants_min",
            "participants_max",
            "duration_min",
            "duration_max",
            "materials_category",
            "difficulty_level",
            "language",
            "extraction_confidence",
        )
    }

    merged = Activity(
        description=_longest(*(a.description for a in cluster)) or rep.description,
        rules=_longest(*(a.rules for a in cluster)),
        variations=_longest(*(a.variations for a in cluster)),
        materials_list=_union_csv([a.materials_list for a in cluster]),
        skills_developed=_union_csv([a.skills_developed for a in cluster]),
        keywords=_union_csv([a.keywords for a in cluster]),
        **inherited,
    )

    # Union of tags across the cluster.
    merged.tags = unique_in_order(t for a in cluster for t in a.tags or [])

    # Every source the activity was seen in.
    merged.source_files = unique_in_order(
        s
        for a in cluster
        for s in [a.source_file, *(a.source_files or [])]
        if s
    )

    # popularity_score++ per merged duplicate (plan §4)
    top_score = max(a.popularity_score for a in cluster)
    merged.popularity_score = top_score + (len(cluster) - 1)
    return merged
def dedup_activities(activities: list[Activity]) -> tuple[list[Activity], dict]:
    """
    Dedup per plan D5.

    Groups by (normalized_name, language) — different languages are NEVER
    merged. Within a group, descriptions are clustered with rapidfuzz:
        >= 85   -> same cluster (auto-merge)
        60-85   -> borderline: kept as separate clusters, both flagged needs_review
        < 60    -> separate variants

    Returns:
        ``(result, stats)`` where ``result`` holds one merged Activity per
        cluster and ``stats`` has the keys ``input``, ``auto_merged``
        (activities folded into an existing cluster), ``borderline``
        (clusters flagged needs_review) and ``output``.
    """
    # Imported here rather than at module level, so rapidfuzz is only needed
    # when dedup actually runs.
    from rapidfuzz import fuzz

    # Bucket by (normalized name, language); fall back to normalizing the
    # display name when the activity carries no normalized_name.
    groups: dict[tuple, list[Activity]] = defaultdict(list)
    for act in activities:
        key = (act.normalized_name or normalize_name(act.name), act.language)
        groups[key].append(act)

    result: list[Activity] = []
    stats = {"input": len(activities), "auto_merged": 0, "borderline": 0, "output": 0}

    for members in groups.values():
        clusters: list[list[Activity]] = []
        # Indices (into `clusters`) of clusters that need a human review.
        borderline_idx: set[int] = set()
        for act in members:
            best_idx, best_score = -1, -1.0
            borderline_here: list[int] = []
            for idx, cluster in enumerate(clusters):
                # Each candidate is compared against the FIRST member of the
                # cluster only, so the outcome depends on input order.
                score = fuzz.token_sort_ratio(
                    act.description or "", cluster[0].description or ""
                )
                if score >= AUTO_MERGE_THRESHOLD:
                    # Strict '>' keeps the earliest cluster on equal scores.
                    if score > best_score:
                        best_idx, best_score = idx, score
                elif score >= BORDERLINE_THRESHOLD:
                    borderline_here.append(idx)
            if best_idx >= 0:
                # Auto-merge wins: borderline hits against other clusters are
                # not recorded in this branch.
                clusters[best_idx].append(act)
            else:
                clusters.append([act])
                new_idx = len(clusters) - 1
                # Flag both sides of every borderline pair.
                for bidx in borderline_here:
                    borderline_idx.add(bidx)
                    borderline_idx.add(new_idx)

        for idx, cluster in enumerate(clusters):
            merged = merge_cluster(cluster)
            if len(cluster) > 1:
                stats["auto_merged"] += len(cluster) - 1
            if idx in borderline_idx:
                merged.needs_review = 1
                stats["borderline"] += 1
            result.append(merged)

    stats["output"] = len(result)
    return result, stats
# --------------------------------------------------------------------------
# step 5 — review decisions
# --------------------------------------------------------------------------
def load_review_decisions(path: Path) -> dict:
    """Load the review-decisions mapping from a JSON file.

    Loading is best-effort: a missing file, an unreadable file, malformed
    JSON, a non-UTF-8 file or a top-level value that is not an object all
    give ``{}``. When the file exists but cannot be used, a warning goes to
    stderr so that ignored decisions do not pass unnoticed.

    Args:
        path: Location of the decisions file (may be falsy).

    Returns:
        The decoded dict, or ``{}`` when nothing usable was found.
    """
    if not path or not path.is_file():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        # (the latter is raised by read_text on a non-UTF-8 file).
        print(
            f"WARNING: ignoring unreadable review decisions file {path}: {exc}",
            file=sys.stderr,
        )
        return {}
    if not isinstance(data, dict):
        print(
            f"WARNING: ignoring review decisions file {path}: "
            f"expected a JSON object, got {type(data).__name__}",
            file=sys.stderr,
        )
        return {}
    return data
def apply_review_decisions(
    activities: list[Activity], decisions: dict
) -> tuple[list[Activity], dict]:
    """
    Apply data/review_decisions.json (plan §5c).

    Each activity is looked up by its content_key (normalized name, language,
    description). A decision may be stored as a plain string or as a dict
    with a `decision` entry. `drop` removes the row; `keep-separate` and
    `merge` clear needs_review; rows with any other or no decision are kept
    unchanged.

    Returns ``(kept_activities, {"dropped": n, "resolved": m})``.
    """
    survivors: list[Activity] = []
    dropped = 0
    resolved = 0

    for act in activities:
        key = content_key(
            act.normalized_name or normalize_name(act.name),
            act.language,
            act.description or "",
        )
        entry = decisions.get(key)
        if isinstance(entry, dict):
            decision = entry.get("decision")
        else:
            decision = entry

        if decision == "drop":
            dropped += 1
            continue
        if decision in ("keep-separate", "merge"):
            # The reviewer has resolved this row.
            act.needs_review = 0
            resolved += 1
        survivors.append(act)

    return survivors, {"dropped": dropped, "resolved": resolved}
# --------------------------------------------------------------------------
# golden-set recall (plan §7)
# --------------------------------------------------------------------------
def _golden_names(data: Any) -> list[str]:
items = data.get("activities", data) if isinstance(data, dict) else data
names: list[str] = []
for item in items or []:
if isinstance(item, str):
names.append(item)
elif isinstance(item, dict) and item.get("name"):
names.append(item["name"])
return names
def golden_recall(golden_dir: Path, activities: list[Activity]) -> Optional[dict]:
    """Measure how many golden-set names appear among the built activities.

    Every ``*.json`` file in `golden_dir` is read (unreadable or malformed
    files are skipped) and its names are matched, after normalization,
    against the normalized names of `activities`.

    Returns ``None`` when the directory is missing or no names are expected,
    otherwise ``{"expected", "found", "recall"}`` with recall rounded to
    three decimals.
    """
    if not golden_dir or not golden_dir.is_dir():
        return None

    known = {normalize_name(a.name) for a in activities}

    wanted: list[str] = []
    for golden_file in sorted(golden_dir.glob("*.json")):
        try:
            payload = json.loads(golden_file.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            continue
        wanted.extend(_golden_names(payload))

    if not wanted:
        return None

    matched = sum(1 for name in wanted if normalize_name(name) in known)
    return {
        "expected": len(wanted),
        "found": matched,
        "recall": round(matched / len(wanted), 3),
    }
# --------------------------------------------------------------------------
# load + validate + excerpt-check the extraction files
# --------------------------------------------------------------------------
def collect_activities(
    extracted_dir: Path,
    chunks_dir: Path,
    sources_dir: Path,
    schema: dict,
) -> dict:
    """
    Validate, excerpt-check and convert every extraction file.

    For each file yielded by ``iter_extraction_files(extracted_dir)``:

    * unparseable files (invalid JSON or not valid UTF-8) and files failing
      ``validate_extraction`` are moved to ``<extracted_dir>/_rejected`` with
      an ``.errors.txt`` log;
    * each activity whose ``source_excerpt`` does not match the chunk text is
      dropped and logged as hallucinated; when no chunk text is available the
      activity is kept unverified;
    * surviving activities are converted with ``dict_to_activity``.

    Returns the report counters plus ``"activities"`` (the converted list) and
    ``"category_fallbacks"`` (the result of ``log_category_fallbacks``).
    """
    # Local import (kept out of module scope as in the original), but done
    # once here rather than on every loop iteration.
    from import_common import chunk_key_for, validate_extraction

    rejected_dir = extracted_dir / "_rejected"
    activities: list[Activity] = []
    report = {
        "files_total": 0,
        "files_valid": 0,
        "files_rejected_schema": 0,
        "activities_raw": 0,
        "activities_hallucinated": 0,
        "category_fallbacks": [],
    }
    raw_categories: list[tuple[str, str]] = []

    for json_path in iter_extraction_files(extracted_dir):
        report["files_total"] += 1
        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # A file that is not valid UTF-8 is just as unusable as one with
            # broken JSON: reject it instead of crashing the whole rebuild.
            _reject_file(json_path, rejected_dir, [f"invalid JSON: {exc}"])
            report["files_rejected_schema"] += 1
            continue

        errors = validate_extraction(data, schema)
        if errors:
            _reject_file(json_path, rejected_dir, errors)
            report["files_rejected_schema"] += 1
            continue
        report["files_valid"] += 1

        header = data.get("header", {})
        chunk_text = find_chunk_text(json_path, header, chunks_dir)
        # Fall back to the chunk key with its ".partNN" suffix removed.
        source_id = header.get("source_id") or chunk_key_for(json_path, header).rsplit(
            ".part", 1
        )[0]
        fallback_source = (
            source_path_for(source_id, sources_dir) or source_id or json_path.stem
        )

        hallucinated: list[dict] = []
        for adict in data.get("activities", []):
            report["activities_raw"] += 1
            excerpt = adict.get("source_excerpt") or ""
            # if the chunk text is unavailable we cannot verify — keep but the
            # QA report still counts it under activities_raw.
            if chunk_text is not None and not excerpt_matches(excerpt, chunk_text):
                hallucinated.append(adict)
                report["activities_hallucinated"] += 1
                continue
            src = adict.get("source_file") or fallback_source
            raw_category = adict.get("category", "")
            raw_categories.append((raw_category, normalize_category(raw_category)))
            activities.append(dict_to_activity(adict, src))

        if hallucinated:
            _log_hallucinations(json_path, rejected_dir, hallucinated)

    report["category_fallbacks"] = log_category_fallbacks(raw_categories)
    report["activities"] = activities
    return report
def _reject_file(json_path: Path, rejected_dir: Path, errors: list[str]) -> None:
    """
    Quarantine an extraction file that failed validation.

    Moves ``json_path`` into ``rejected_dir`` (created on demand) and writes a
    sibling ``<stem>.errors.txt`` listing every entry of ``errors``.
    """
    rejected_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(str(json_path), str(rejected_dir / json_path.name))

    bullet_list = "\n".join(f" - {e}" for e in errors)
    message = f"REJECTED (schema validation): {json_path.name}\n\n" + bullet_list + "\n"
    error_log = rejected_dir / f"{json_path.stem}.errors.txt"
    error_log.write_text(message, encoding="utf-8")
def _log_hallucinations(
    json_path: Path, rejected_dir: Path, hallucinated: list[dict]
) -> None:
    """
    Record the activities dropped because their excerpt was not in the chunk.

    Writes ``<rejected_dir>/<stem>.hallucinations.txt`` (directory created on
    demand) with the repr of each dropped activity's name and source_excerpt.
    The extraction file itself is left where it is.
    """
    rejected_dir.mkdir(parents=True, exist_ok=True)
    report_lines = [
        f"DROPPED activities (source_excerpt not found in chunk): {json_path.name}",
        "",
    ]
    for dropped in hallucinated:
        report_lines += [
            f" - {dropped.get('name')!r}",
            f" excerpt: {dropped.get('source_excerpt')!r}",
        ]
    target = rejected_dir / f"{json_path.stem}.hallucinations.txt"
    target.write_text("\n".join(report_lines) + "\n", encoding="utf-8")
# --------------------------------------------------------------------------
# DB write + atomic swap
# --------------------------------------------------------------------------
def _enrich_category_display_names(db_path: Path) -> None:
    """
    Set ``categories.display_name`` for every ``type = 'category'`` row to the
    value ``category_display_name`` returns for that row's slug.

    Opens its own SQLite connection to ``db_path`` and always closes it.
    """
    import sqlite3

    connection = sqlite3.connect(db_path)
    try:
        slugs = [
            row[0]
            for row in connection.execute(
                "SELECT value FROM categories WHERE type = 'category'"
            ).fetchall()
        ]
        connection.executemany(
            "UPDATE categories SET display_name = ? WHERE type='category' AND value = ?",
            [(category_display_name(slug), slug) for slug in slugs],
        )
        connection.commit()
    finally:
        connection.close()
def write_database(db_tmp_path: Path, activities: list[Activity]) -> None:
    """
    Build a brand-new database file at ``db_tmp_path``.

    Any existing file at that path is deleted first. A ``DatabaseManager`` is
    then opened on the path, ``activities`` are bulk-inserted, category display
    names are filled in, and the FTS index is rebuilt.
    """
    leftover = db_tmp_path.exists()
    if leftover:
        # start from an empty file, never append to a previous build
        db_tmp_path.unlink()

    manager = DatabaseManager(str(db_tmp_path))
    manager.bulk_insert_activities(activities)
    _enrich_category_display_names(db_tmp_path)
    manager.rebuild_fts_index()
def atomic_swap(db_tmp_path: Path, db_path: Path) -> Optional[Path]:
    """
    Move the freshly built database over the live one.

    When ``db_path`` already exists it is first copied (with metadata) to
    ``<db_path>.bak``. ``db_tmp_path`` is then renamed onto ``db_path`` with
    ``os.replace``.

    Returns the backup path, or ``None`` when there was no live DB to back up.
    """
    if not db_path.exists():
        os.replace(db_tmp_path, db_path)
        return None

    backup_path = db_path.with_suffix(db_path.suffix + ".bak")
    shutil.copy2(db_path, backup_path)
    os.replace(db_tmp_path, db_path)
    return backup_path
# --------------------------------------------------------------------------
# orchestration
# --------------------------------------------------------------------------
def rebuild(
    *,
    extracted_dir: Path,
    chunks_dir: Path,
    sources_dir: Path,
    db_path: Path,
    decisions_path: Optional[Path] = None,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
    golden_dir: Optional[Path] = None,
    do_swap: bool = True,
) -> dict:
    """
    Run the full pipeline: collect -> dedup -> review decisions -> write -> swap.

    The database is written to ``<db_path>.tmp``; the live file is replaced
    only by the final ``atomic_swap`` (skipped when ``do_swap`` is false). If
    writing or swapping raises, the tmp file is removed and the error
    re-raised.

    Returns the collection counters extended with ``dedup``, ``decisions``,
    ``final_count``, ``backup``, ``swapped`` and the ``qa`` report.
    """
    extracted_dir = Path(extracted_dir)
    db_path = Path(db_path)
    db_tmp_path = db_path.with_suffix(db_path.suffix + ".tmp")

    schema = load_schema(schema_path)
    collected = collect_activities(
        extracted_dir, Path(chunks_dir), Path(sources_dir), schema
    )
    activities: list[Activity] = collected.pop("activities")

    deduped, dedup_stats = dedup_activities(activities)
    if decisions_path:
        decisions = load_review_decisions(Path(decisions_path))
    else:
        decisions = {}
    final, decision_stats = apply_review_decisions(deduped, decisions)

    try:
        write_database(db_tmp_path, final)
        if do_swap:
            backup = atomic_swap(db_tmp_path, db_path)
        else:
            backup = None
    except Exception:
        # never leave a half-written tmp DB behind
        if db_tmp_path.exists():
            db_tmp_path.unlink()
        raise

    report = dict(collected)
    report["dedup"] = dedup_stats
    report["decisions"] = decision_stats
    report["final_count"] = len(final)
    report["backup"] = str(backup) if backup else None
    report["swapped"] = do_swap
    report["qa"] = _qa_report(final, collected, golden_dir)
    return report
def _qa_report(
    activities: list[Activity], collected: dict, golden_dir: Optional[Path]
) -> dict:
    """
    Summarize the final activity list for the QA printout.

    Counts rows per category, per content_type and per extraction_confidence
    (missing values are bucketed under ``"?"``), the percentage of rows with
    non-blank rules, the rows still flagged ``needs_review``, the hallucination
    rate from ``collected`` and, when ``golden_dir`` is given, golden recall.
    """
    by_category: dict[str, int] = {}
    by_content_type: dict[str, int] = {}
    by_confidence: dict[str, int] = {}
    rules_present = 0
    flagged = 0

    for act in activities:
        by_category[act.category] = by_category.get(act.category, 0) + 1
        ctype = act.content_type or "?"
        by_content_type[ctype] = by_content_type.get(ctype, 0) + 1
        conf = act.extraction_confidence or "?"
        by_confidence[conf] = by_confidence.get(conf, 0) + 1
        if act.rules and act.rules.strip():
            rules_present += 1
        if act.needs_review:
            flagged += 1

    total = len(activities)
    raw_count = collected.get("activities_raw", 0)
    dropped = collected.get("activities_hallucinated", 0)

    if activities:
        pct_rules = round(100 * rules_present / total, 1)
    else:
        pct_rules = 0.0
    if raw_count:
        hallucination_rate = round(100 * dropped / raw_count, 2)
    else:
        hallucination_rate = 0.0
    if golden_dir:
        recall = golden_recall(Path(golden_dir), activities)
    else:
        recall = None

    return {
        "total": total,
        "per_category": by_category,
        "per_content_type": by_content_type,
        "extraction_confidence": by_confidence,
        "pct_with_rules": pct_rules,
        "needs_review": flagged,
        "hallucination_rate": hallucination_rate,
        "golden_recall": recall,
    }
def print_report(report: dict) -> None:
    """
    Print the human-readable QA report for a finished rebuild to stdout.

    ``report`` is the dict returned by ``rebuild``. The golden-recall,
    category-fallback and backup lines are printed only when present.
    """
    qa = report["qa"]
    dedup = report["dedup"]
    decisions = report["decisions"]
    rule = "=" * 60

    def by_count_desc(counts: dict) -> list:
        # most frequent bucket first
        return sorted(counts.items(), key=lambda kv: -kv[1])

    print(rule)
    print("BUILD DATABASE — QA REPORT")
    print(rule)
    print(f"extraction files : {report['files_total']} "
          f"(valid {report['files_valid']}, schema-rejected {report['files_rejected_schema']})")
    print(f"activities raw : {report['activities_raw']}")
    print(f" hallucinated drop : {report['activities_hallucinated']} "
          f"({qa['hallucination_rate']}%)")
    print(f"dedup : {dedup['input']} -> {dedup['output']} "
          f"(auto-merged {dedup['auto_merged']}, borderline {dedup['borderline']})")
    print(f"review decisions : dropped {decisions['dropped']}, "
          f"resolved {decisions['resolved']}")
    print(f"final inserted : {report['final_count']}")
    print(f"% with rules : {qa['pct_with_rules']}")
    print(f"needs_review rows : {qa['needs_review']}")

    print("per category :")
    for slug, count in by_count_desc(qa["per_category"]):
        print(f" {slug:<24}: {count}")
    print("per content_type :")
    for ctype, count in by_count_desc(qa["per_content_type"]):
        print(f" {ctype:<24}: {count}")
    print("extraction_confidence:")
    for level, count in sorted(qa["extraction_confidence"].items()):
        print(f" {level:<24}: {count}")

    golden = qa["golden_recall"]
    if golden:
        print(f"golden recall : {golden['found']}/{golden['expected']} = {golden['recall']}")
    if report["category_fallbacks"]:
        print("category fallbacks :")
        for msg in report["category_fallbacks"]:
            print(f" {msg}")
    if report["backup"]:
        print(f"live DB backed up to : {report['backup']}")
    print(rule)
# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """
    Command-line entry point: parse options, run ``rebuild``, print the report.

    ``--rebuild`` is mandatory (it is the only supported mode); without it the
    parser exits with an error. Returns 0 on success.
    """
    parser = argparse.ArgumentParser(description="Build activities.db from extraction JSON.")
    parser.add_argument("--rebuild", action="store_true",
                        help="rebuild the database from scratch (only mode supported)")
    path_options = (
        ("--extracted", "data/extracted"),
        ("--chunks", "data/chunks"),
        ("--sources", "data/sources"),
        ("--db", "data/activities.db"),
        ("--decisions", "data/review_decisions.json"),
        ("--golden", "data/golden"),
        ("--schema", str(DEFAULT_SCHEMA_PATH)),
    )
    for flag, default in path_options:
        parser.add_argument(flag, default=default)

    opts = parser.parse_args(argv)
    if not opts.rebuild:
        parser.error("only --rebuild is supported (full rebuild, no incremental merge)")

    print_report(
        rebuild(
            extracted_dir=Path(opts.extracted),
            chunks_dir=Path(opts.chunks),
            sources_dir=Path(opts.sources),
            db_path=Path(opts.db),
            decisions_path=Path(opts.decisions),
            schema_path=Path(opts.schema),
            golden_dir=Path(opts.golden),
        )
    )
    return 0
if __name__ == "__main__":
raise SystemExit(main())

251
scripts/chunk_sources.py Normal file
View File

@@ -0,0 +1,251 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
chunk_sources.py — split normalized data/sources/*.txt into ~20-page chunks
for subagent extraction, and maintain data/chunks/manifest.json.
Paginated text → ~20-page chunks, ~4-page overlap (plan D8).
Unpaginated text → ~10000-word windows, ~2000-word overlap.
The manifest is a cache derived from the filesystem + per-chunk state. Re-running
this script is idempotent: existing chunk states (pending/assigned/done/rejected)
survive as long as the source content hash is unchanged.
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
sys.path.insert(0, str(SCRIPT_DIR))
from extract_common import content_hash, split_pages # noqa: E402
SCHEMA_VERSION = "1.0"
PAGES_PER_CHUNK = 20
PAGE_OVERLAP = 4
WORD_WINDOW = 10_000
WORD_OVERLAP = 2_000
VALID_STATES = {"pending", "assigned", "done", "rejected"}
# --------------------------------------------------------------------------
# header parsing
# --------------------------------------------------------------------------
def parse_source(text: str) -> tuple[dict, str]:
    """
    Separate a normalized source file into its header fields and its body.

    Lines are scanned top-down. ``key: value`` lines are collected into the
    header until a rule line made only of ``=`` is met; the body then starts
    on the line after the rule. A line starting with ``--- PAGE `` ends the
    scan and becomes the first body line. With neither a rule nor a page
    marker the body is the whole text.
    """
    lines = text.splitlines()
    meta: dict = {}
    cut = 0
    header_open = True

    for idx, raw in enumerate(lines):
        if raw.startswith("--- PAGE "):
            cut = idx
            break
        if header_open:
            stripped = raw.strip()
            if stripped and not stripped.strip("="):
                # rule line: header is over, body begins on the next line
                cut = idx + 1
                header_open = False
            elif ":" in raw:
                name, _, value = raw.partition(":")
                meta[name.strip()] = value.strip()

    return meta, "\n".join(lines[cut:])
# --------------------------------------------------------------------------
# chunking — pure functions
# --------------------------------------------------------------------------
def chunk_pages(
    pages: list[tuple[int, str]],
    pages_per_chunk: int = PAGES_PER_CHUNK,
    overlap: int = PAGE_OVERLAP,
) -> list[dict]:
    """
    Split an ordered list of (page_no, text) into overlapping chunks.

    Consecutive chunks start ``stride = max(1, pages_per_chunk - overlap)``
    pages apart, so each chunk repeats the last ``overlap`` pages of the
    previous one (an ``overlap`` >= ``pages_per_chunk`` degrades to a stride of
    one page). The last chunk may be shorter than ``pages_per_chunk``.

    Each chunk is a dict with ``page_start``, ``page_end``, ``chunk_range``
    (``"pages A-B"``) and ``text`` (the pages re-joined with their
    ``--- PAGE N ---`` markers).

    Raises:
        ValueError: if ``pages_per_chunk`` < 1 or ``overlap`` < 0. A negative
            overlap would make the stride larger than the window and silently
            skip pages.
    """
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be >= 1, got {pages_per_chunk}")
    if overlap < 0:
        raise ValueError(f"overlap must be >= 0, got {overlap}")
    if not pages:
        return []
    stride = max(1, pages_per_chunk - overlap)
    chunks: list[dict] = []
    i = 0
    n = len(pages)
    while i < n:
        window = pages[i : i + pages_per_chunk]
        first, last = window[0][0], window[-1][0]
        text = "".join(
            f"\n--- PAGE {num} ---\n{txt}\n" for num, txt in window
        )
        chunks.append(
            {"page_start": first, "page_end": last,
             "chunk_range": f"pages {first}-{last}", "text": text}
        )
        # this window already reached the final page: another chunk would
        # only repeat pages that are covered
        if i + pages_per_chunk >= n:
            break
        i += stride
    return chunks
def chunk_words(
    text: str, window: int = WORD_WINDOW, overlap: int = WORD_OVERLAP
) -> list[dict]:
    """
    Split unpaginated text into overlapping word windows.

    The text is split on whitespace; consecutive windows start
    ``stride = max(1, window - overlap)`` words apart. Each chunk is a dict
    with ``word_start``, ``word_end`` (exclusive), ``chunk_range``
    (``"words A-B"``) and ``text`` (the words re-joined with single spaces).

    Raises:
        ValueError: if ``window`` < 1 or ``overlap`` < 0. A zero window would
            emit one empty chunk per word; a negative overlap would silently
            drop words between windows.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    if overlap < 0:
        raise ValueError(f"overlap must be >= 0, got {overlap}")
    words = text.split()
    if not words:
        return []
    stride = max(1, window - overlap)
    chunks: list[dict] = []
    i = 0
    n = len(words)
    while i < n:
        seg = words[i : i + window]
        chunks.append(
            {"word_start": i, "word_end": i + len(seg),
             "chunk_range": f"words {i}-{i + len(seg)}", "text": " ".join(seg)}
        )
        # the final word is already inside this window
        if i + window >= n:
            break
        i += stride
    return chunks
def make_chunks(source_text: str) -> list[dict]:
    """
    Chunk one normalized source file.

    The header is discarded. If the body contains page markers it is chunked
    by page, otherwise by word window.
    """
    body = parse_source(source_text)[1]
    paged = split_pages(body)
    if not paged:
        return chunk_words(body)
    return chunk_pages(paged)
# --------------------------------------------------------------------------
# manifest
# --------------------------------------------------------------------------
def _empty_manifest() -> dict:
    """Return a fresh manifest: current schema version and no chunks."""
    manifest: dict = {}
    manifest["schema_version"] = SCHEMA_VERSION
    manifest["chunks"] = {}
    return manifest
def load_manifest(manifest_path: Path) -> dict:
    """
    Load the chunk manifest, falling back to an empty one.

    An empty manifest is returned when the file is missing, unreadable, not
    valid UTF-8 / JSON, or does not contain a JSON object. A loaded manifest
    always has a ``schema_version`` key and a dict under ``chunks``.
    """
    if not manifest_path.exists():
        return _empty_manifest()
    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        return _empty_manifest()
    if not isinstance(data, dict):
        # e.g. a bare list or null: not a manifest, and .setdefault would fail
        return _empty_manifest()
    data.setdefault("schema_version", SCHEMA_VERSION)
    if not isinstance(data.get("chunks"), dict):
        # callers index manifest["chunks"] as a dict; repair a missing/null value
        data["chunks"] = {}
    return data
def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """
    Write the manifest as pretty-printed UTF-8 JSON, atomically.

    The JSON is first written to ``<manifest_path>.tmp`` in the same directory
    and then renamed over the target. An interrupted run therefore leaves the
    previous manifest intact instead of a truncated file, which
    ``load_manifest`` would silently replace with an empty manifest, discarding
    every chunk state. The parent directory is created on demand.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, indent=2, ensure_ascii=False)
    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Path.replace is an atomic rename when source and target share a filesystem
    tmp_path.replace(manifest_path)
def chunk_source_file(
    source_path: Path, chunks_dir: Path, manifest: dict
) -> list[str]:
    """
    Chunk one data/sources/<id>.txt → data/chunks/<id>/<id>.partNN.txt and
    register every chunk in `manifest`.

    A chunk's prior state is preserved only when both the source content hash
    and the chunk's range are unchanged; otherwise it is reset to ``pending``.
    Comparing the range as well matters when the source is identical but the
    chunking windows changed: the same key then holds different text and must
    be re-extracted.

    Returns the list of chunk keys written.
    """
    source_id = source_path.stem
    text = source_path.read_text(encoding="utf-8", errors="replace")
    src_hash = content_hash(text)
    chunks = make_chunks(text)
    out_dir = chunks_dir / source_id
    out_dir.mkdir(parents=True, exist_ok=True)
    written: list[str] = []
    for idx, chunk in enumerate(chunks, 1):
        key = f"{source_id}.part{idx:02d}"
        chunk_file = out_dir / f"{key}.txt"
        chunk_file.write_text(chunk["text"], encoding="utf-8")
        prior = manifest["chunks"].get(key)
        unchanged = (
            bool(prior)
            and prior.get("source_hash") == src_hash
            and prior.get("chunk_range") == chunk["chunk_range"]
            and prior.get("state") in VALID_STATES
        )
        state = prior["state"] if unchanged else "pending"
        manifest["chunks"][key] = {
            "source_id": source_id,
            "source_hash": src_hash,
            "part": idx,
            "chunk_range": chunk["chunk_range"],
            # stored relative to the parent of chunks_dir
            "chunk_file": str(chunk_file.relative_to(chunks_dir.parent)),
            "expected_json": f"{key}.json",
            "state": state,
        }
        written.append(key)
    return written
def prune_stale(manifest: dict, live_keys: set[str]) -> list[str]:
    """
    Remove from ``manifest["chunks"]`` every key that is not in ``live_keys``.

    Mutates ``manifest`` in place and returns the removed keys in their
    original manifest order.
    """
    entries = manifest["chunks"]
    removed = [key for key in entries if key not in live_keys]
    for key in removed:
        entries.pop(key)
    return removed
# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def run(sources_dir: Path, chunks_dir: Path) -> dict:
    """
    Chunk every ``*.txt`` in ``sources_dir`` and refresh the manifest.

    Loads ``<chunks_dir>/manifest.json``, (re)chunks each source, drops
    manifest entries for chunks that were not produced in this run, saves the
    manifest, and returns a summary with the number of sources, chunks and
    pruned entries plus a per-state chunk count.
    """
    manifest_path = chunks_dir / "manifest.json"
    manifest = load_manifest(manifest_path)

    source_files = sorted(sources_dir.glob("*.txt"))
    live_keys: set[str] = set()
    for source in source_files:
        for key in chunk_source_file(source, chunks_dir, manifest):
            live_keys.add(key)

    stale = prune_stale(manifest, live_keys)
    save_manifest(manifest, manifest_path)

    states: dict[str, int] = {}
    for meta in manifest["chunks"].values():
        state = meta["state"]
        states[state] = states.get(state, 0) + 1

    summary = {
        "sources": len(source_files),
        "chunks": len(live_keys),
        "pruned": len(stale),
        "states": states,
    }
    return summary
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point: chunk the sources and print a short summary.

    Returns 0.
    """
    parser = argparse.ArgumentParser(description="Chunk normalized sources.")
    parser.add_argument("--sources", default="data/sources", help="sources dir")
    parser.add_argument("--chunks", default="data/chunks", help="chunks output dir")
    opts = parser.parse_args(argv)

    summary = run(Path(opts.sources), Path(opts.chunks))

    print(f"sources processed : {summary['sources']}")
    print(f"chunks written : {summary['chunks']}")
    print(f"stale pruned : {summary['pruned']}")
    per_state = sorted(summary["states"].items())
    for state, count in per_state:
        print(f" {state:<10}: {count}")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,54 +0,0 @@
# TEMPLATE PENTRU EXTRACȚIE ACTIVITĂȚI CU CLAUDE
## Instrucțiuni pentru Claude Code:
Pentru fiecare PDF/DOC, folosește următorul format de extracție:
### 1. Citește fișierul:
```
Claude, te rog citește fișierul: [CALE_FISIER]
```
### 2. Extrage activitățile folosind acest template JSON:
```json
{
"source_file": "[NUME_FISIER]",
"activities": [
{
"name": "Numele activității",
"description": "Descrierea completă a activității",
"rules": "Regulile jocului/activității",
"variations": "Variante sau adaptări",
"category": "[A-H] bazat pe tip",
"age_group_min": 6,
"age_group_max": 14,
"participants_min": 4,
"participants_max": 20,
"duration_min": 10,
"duration_max": 30,
"materials_list": "Lista materialelor necesare",
"skills_developed": "Competențe dezvoltate",
"difficulty_level": "Ușor/Mediu/Dificil",
"keywords": "cuvinte cheie separate prin virgulă",
"tags": "taguri relevante"
}
]
}
```
### 3. Salvează în fișier:
După extracție, salvează JSON-ul în: `/scripts/extracted_activities/[NUME_FISIER].json`
### 4. Priorități de procesare:
**TOP PRIORITY (procesează primele):**
1. 1000 Fantastic Scout Games.pdf
2. Cartea Mare a jocurilor.pdf
3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf
4. 101 Ways to Create an Unforgettable Camp Experience.pdf
5. 151 Awesome Summer Camp Nature Activities.pdf
**Categorii de focus:**
- [A] Jocuri Cercetășești
- [C] Camping & Activități Exterior
- [G] Activități Educaționale

View File

@@ -1,164 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DATABASE SETUP SCRIPT - INDEX-SISTEM-JOCURI
Script pentru recrearea bazelor de date din .gitignore
Folosește clasele DatabaseManager pentru consistență
Usage:
python scripts/create_databases.py
python scripts/create_databases.py --clear-existing
"""
import sys
import argparse
from pathlib import Path
# Add src to path so we can import our modules
sys.path.append(str(Path(__file__).parent.parent / 'src'))
from database import DatabaseManager
from game_library_manager import GameLibraryManager
def create_main_database(db_path: str = "data/activities.db", clear: bool = False):
    """
    Create the main activities database at ``db_path``.

    With ``clear`` set, an existing file at that path is deleted first. The
    database is opened through ``DatabaseManager`` and probed with
    ``get_statistics``. Returns True on success, False if the probe raised.
    """
    target = Path(db_path)
    must_remove = clear and target.exists()
    if must_remove:
        print(f"🗑️ Removing existing database: {db_path}")
        target.unlink()

    print(f"📊 Creating main database: {db_path}")
    manager = DatabaseManager(db_path)

    # Smoke-test the new database
    try:
        statistics = manager.get_statistics()
        print(f"✅ Database created successfully: {statistics['total_activities']} activities")
        return True
    except Exception as e:
        print(f"❌ Error creating database: {e}")
        return False
def create_game_library_database(db_path: str = "data/game_library.db", clear: bool = False):
    """
    Create the legacy game library database at ``db_path``.

    With ``clear`` set, an existing file at that path is deleted first.
    Instantiating ``GameLibraryManager`` on the path is the whole creation
    step. Always returns True.
    """
    target = Path(db_path)
    if clear and target.exists():
        print(f"🗑️ Removing existing database: {db_path}")
        target.unlink()

    print(f"📊 Creating game library database: {db_path}")
    # the instance is not needed afterwards; it is constructed for its effect
    GameLibraryManager(db_path)
    print("✅ Game library database created successfully")
    return True
def create_test_database(db_path: str = "data/test_activities.db", clear: bool = False):
    """
    Create the test database at ``db_path`` and seed it with one sample row.

    With ``clear`` set, an existing file at that path is deleted first.
    Returns True when the sample activity was inserted and statistics could be
    read back, False if either step raised.
    """
    target = Path(db_path)
    if clear and target.exists():
        print(f"🗑️ Removing existing database: {db_path}")
        target.unlink()

    print(f"📊 Creating test database: {db_path}")
    manager = DatabaseManager(db_path)

    # Sample row used to verify the database accepts inserts
    sample = dict([
        ('title', 'Test Activity - Setup Script'),
        ('description', 'This is a test activity created by the setup script'),
        ('file_path', 'test/sample.txt'),
        ('file_type', 'TXT'),
        ('category', 'test'),
        ('age_group', '8-12 ani'),
        ('participants', '5-10 persoane'),
        ('duration', '15-30min'),
        ('materials', 'Fără materiale'),
        ('tags', '["test", "setup"]'),
        ('source_text', 'Sample test content for verification'),
    ])

    try:
        manager.insert_activity(sample)
        statistics = manager.get_statistics()
        print(f"✅ Test database created with sample data: {statistics['total_activities']} activities")
        return True
    except Exception as e:
        print(f"❌ Error creating test database: {e}")
        return False
def ensure_data_directory():
    """Create ``data/`` under the current working directory if it is missing."""
    data_dir = Path("data")
    if data_dir.exists():
        print(f"📁 Data directory exists: {data_dir}")
        return
    print(f"📁 Creating data directory: {data_dir}")
    data_dir.mkdir(parents=True)
def main():
    """
    Command-line entry point for the database setup script.

    Creates the test database only (``--test-only``), the main database only
    (``--main-only``) or all three databases. Returns 0 when every requested
    database was created, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Create databases for INDEX-SISTEM-JOCURI')
    parser.add_argument('--clear-existing', '-c', action='store_true',
                        help='Remove existing databases before creating new ones')
    parser.add_argument('--main-only', action='store_true',
                        help='Create only the main activities database')
    parser.add_argument('--test-only', action='store_true',
                        help='Create only the test database')
    args = parser.parse_args()
    clear = args.clear_existing

    print("🚀 DATABASE SETUP - INDEX-SISTEM-JOCURI")
    print("=" * 50)

    # The data directory must exist before any database file is created
    ensure_data_directory()

    success_count = 0
    if args.test_only:
        total_count = 1
        if create_test_database(clear=clear):
            success_count = 1
    elif args.main_only:
        total_count = 1
        if create_main_database(clear=clear):
            success_count = 1
    else:
        # Build every database; one failure does not stop the others
        builders = [
            ("Main activities", create_main_database),
            ("Game library", create_game_library_database),
            ("Test activities", create_test_database),
        ]
        total_count = len(builders)
        for name, builder in builders:
            print(f"\n📂 Creating {name} database...")
            try:
                created = builder(clear=clear)
            except Exception as e:
                print(f"❌ Failed to create {name} database: {e}")
                continue
            if created:
                success_count += 1

    print("\n" + "=" * 50)
    print(f"🎯 SUMMARY: {success_count}/{total_count} databases created successfully")

    if success_count != total_count:
        print("⚠️ Some databases failed to create. Check errors above.")
        return 1

    print("✅ All databases ready!")
    print("\nNext steps:")
    print("1. Run indexer: cd src && python indexer.py --clear-db")
    print("2. Start web app: cd src && python app.py")
    return 0
if __name__ == '__main__':
sys.exit(main())

361
scripts/extract_common.py Normal file
View File

@@ -0,0 +1,361 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
extract_common.py — single home for per-format text extraction.
Every extractor returns a plain text *body* with synthetic page markers
(`--- PAGE N ---`). The file-level header (`SOURCE:` / `CONVERTED:`) is added
by normalize_sources.py, not here.
Critical fix vs. the old pdf_to_text_converter.py: there is NO `max_pages` cap.
Large books are extracted in full.
"""
from __future__ import annotations
import hashlib
import importlib
import os
import re
import shutil
import subprocess
import tempfile
import zipfile
from pathlib import Path
from typing import Callable
PAGE_MARKER_RE = re.compile(r"^--- PAGE (\d+) ---\s*$", re.MULTILINE)
# paragraphs per synthetic page for paginated-by-flow formats (docx)
DOCX_PARAS_PER_PAGE = 40
# formats we deliberately ignore (epub duplicates existing PDFs — plan §1)
IGNORED_EXTENSIONS = {".epub"}
# obvious junk filenames skipped during a walk
JUNK_NAMES = {"desktop.ini", "linkuri-jocuri.txt"}
JUNK_SUFFIXES = {".bak", ".tmp", ".ini"}
# --------------------------------------------------------------------------
# page assembly helpers
# --------------------------------------------------------------------------
def join_pages(pages: list[str], start: int = 1) -> str:
    """
    Concatenate page texts into one body, each preceded by ``--- PAGE N ---``.

    Pages are numbered from ``start``. Each page text is stripped, and a
    ``None`` or empty page still gets its marker.
    """
    return "".join(
        f"\n--- PAGE {number} ---\n{(page or '').strip()}\n"
        for number, page in enumerate(pages, start)
    )
def split_pages(body: str) -> list[tuple[int, str]]:
    """
    Parse a page-marked body back into ``[(page_number, text), ...]``.

    Each page's text runs from the end of its marker to the start of the next
    marker (or the end of the body) and is stripped. Text before the first
    marker is ignored. Returns an empty list when the body has no markers.
    """
    markers = list(PAGE_MARKER_RE.finditer(body))
    # a page ends where the following marker begins; the last one ends at EOF
    stops = [following.start() for following in markers[1:]] + [len(body)]
    return [
        (int(marker.group(1)), body[marker.end():stop].strip())
        for marker, stop in zip(markers, stops)
    ]
def count_page_markers(body: str) -> int:
    """Return how many ``--- PAGE N ---`` marker lines ``body`` contains."""
    return sum(1 for _ in PAGE_MARKER_RE.finditer(body))
# --------------------------------------------------------------------------
# format detection
# --------------------------------------------------------------------------
# Lower-cased file extension -> format key returned by detect_format().
# NOTE(review): ".ppt" is routed to the "pptx" extractor, which opens the file
# with python-pptx. That library presumably reads only Open XML (.pptx)
# packages — confirm legacy binary .ppt files are handled as intended.
FORMAT_BY_EXT = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".doc": "doc",
    ".pptx": "pptx",
    ".ppt": "pptx",
    ".htm": "html",
    ".html": "html",
    ".zip": "zip",
    ".epub": "epub",
    ".txt": "txt",
}
def detect_format(path: str | os.PathLike) -> str:
    """
    Map a path to its format key using only the (case-insensitive) extension.

    Returns ``"unknown"`` for extensions missing from ``FORMAT_BY_EXT``.
    """
    suffix = Path(path).suffix.lower()
    if suffix in FORMAT_BY_EXT:
        return FORMAT_BY_EXT[suffix]
    return "unknown"
def is_junk(path: str | os.PathLike) -> bool:
    """
    Tell whether a file should be skipped during a walk.

    A file is junk when its lower-cased name is in ``JUNK_NAMES``, when it is a
    ``readme*.md`` file, or when its lower-cased suffix is in ``JUNK_SUFFIXES``.
    """
    candidate = Path(path)
    lowered_name = candidate.name.lower()
    suffix = candidate.suffix.lower()
    return (
        lowered_name in JUNK_NAMES
        or (lowered_name.startswith("readme") and suffix == ".md")
        or suffix in JUNK_SUFFIXES
    )
# --------------------------------------------------------------------------
# content hashing + near-duplicate elimination
# --------------------------------------------------------------------------
def _normalize_for_hash(text: str) -> str:
    """
    Canonicalize text for comparison: collapse every whitespace run to one
    space, trim both ends, lower-case. ``None`` or empty input yields ``""``.
    """
    source = text or ""
    collapsed = re.sub(r"\s+", " ", source)
    trimmed = collapsed.strip()
    return trimmed.lower()
def content_hash(text: str) -> str:
    """
    Return the SHA1 hex digest of the whitespace- and case-normalized text.

    Texts differing only in whitespace or letter case hash identically; used
    for exact-duplicate detection.
    """
    canonical = _normalize_for_hash(text)
    digest = hashlib.sha1(canonical.encode("utf-8"))
    return digest.hexdigest()
def near_duplicate_ratio(a: str, b: str) -> float:
    """
    Score the similarity of two texts on a 0-100 scale.

    Both texts are normalized with ``_normalize_for_hash`` and compared with
    rapidfuzz's ``token_sort_ratio``. rapidfuzz is imported lazily.
    """
    from rapidfuzz import fuzz

    left = _normalize_for_hash(a)
    right = _normalize_for_hash(b)
    return fuzz.token_sort_ratio(left, right)
def dedupe_texts(
    items: list[tuple[str, str]], threshold: float = 95.0
) -> list[tuple[str, str]]:
    """
    Filter (key, text) pairs down to mutually distinct texts, first one wins.

    A pair is dropped when its content hash equals that of an already kept
    pair, or when its similarity to any kept text reaches ``threshold``. The
    fuzzy comparison runs only against the kept pairs, never against dropped
    ones.
    """
    survivors: list[tuple[str, str]] = []
    known_hashes: set[str] = set()

    for key, text in items:
        digest = content_hash(text)
        if digest in known_hashes:
            continue

        is_near_duplicate = False
        for _, kept_text in survivors:
            if near_duplicate_ratio(text, kept_text) >= threshold:
                is_near_duplicate = True
                break

        if not is_near_duplicate:
            known_hashes.add(digest)
            survivors.append((key, text))

    return survivors
# --------------------------------------------------------------------------
# preflight dependency check
# --------------------------------------------------------------------------
REQUIRED_PYTHON_MODULES = {
"pdfplumber": "pdfplumber",
"PyPDF2": "pypdf2",
"docx": "python-docx",
"pptx": "python-pptx",
"bs4": "beautifulsoup4",
"lxml": "lxml",
"jsonschema": "jsonschema",
"rapidfuzz": "rapidfuzz",
"chardet": "chardet",
}
def preflight(check_ocr: bool = False) -> dict:
    """
    Verify Python and system dependencies before a long normalization run.

    Returns a dict with:
      * ``missing_python`` — pip names of ``REQUIRED_PYTHON_MODULES`` that
        fail to import;
      * ``missing_system`` — tesseract, reported only when ``check_ocr`` is set;
      * ``warnings`` — a note when neither libreoffice nor soffice is on PATH;
      * ``ok`` — True when both ``missing_*`` lists are empty (warnings do not
        affect it).
    """
    def _cannot_import(module: str) -> bool:
        try:
            importlib.import_module(module)
        except ImportError:
            return True
        return False

    missing_python: list[str] = [
        pip_name
        for module, pip_name in REQUIRED_PYTHON_MODULES.items()
        if _cannot_import(module)
    ]

    warnings: list[str] = []
    office_binary = shutil.which("libreoffice") or shutil.which("soffice")
    if not office_binary:
        warnings.append("libreoffice not found — legacy .doc files cannot be converted")

    missing_system: list[str] = []
    if check_ocr:
        if not shutil.which("tesseract"):
            missing_system.append("tesseract (OCR requested but not installed)")

    return {
        "ok": not (missing_python or missing_system),
        "missing_python": missing_python,
        "missing_system": missing_system,
        "warnings": warnings,
    }
# --------------------------------------------------------------------------
# per-format extractors
# --------------------------------------------------------------------------
def extract_pdf(path: str | os.PathLike) -> str:
    """
    Extract a PDF into a page-marked body, with no page cap.

    pdfplumber is tried first; if it raises for any reason the file is
    re-read with PyPDF2, whose errors propagate to the caller.
    """
    pdf_path = str(path)
    try:
        body = _extract_pdf_pdfplumber(pdf_path)
    except Exception:
        # primary extractor failed — fall back to the secondary one
        body = _extract_pdf_pypdf2(pdf_path)
    return body
def _extract_pdf_pdfplumber(path: str) -> str:
    """
    Extract every page of a PDF with pdfplumber.

    A page whose text extraction raises (or yields nothing) becomes an empty
    page, so page numbering stays aligned with the PDF.
    """
    import pdfplumber

    def _text_or_blank(page) -> str:
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    with pdfplumber.open(path) as pdf:
        # ALL pages — no max_pages
        texts: list[str] = [_text_or_blank(page) for page in pdf.pages]
    return join_pages(texts)
def _extract_pdf_pypdf2(path: str) -> str:
    """
    Extract every page of a PDF with PyPDF2 (fallback extractor).

    A page whose text extraction raises (or yields nothing) becomes an empty
    page, so page numbering stays aligned with the PDF.
    """
    import PyPDF2

    def _text_or_blank(page) -> str:
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    with open(path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # ALL pages — no max_pages
        texts: list[str] = [_text_or_blank(page) for page in reader.pages]
    return join_pages(texts)
def extract_docx(path: str | os.PathLike) -> str:
    """
    Extract a .docx into a page-marked body.

    Word files carry no fixed pagination, so a synthetic page is started every
    ``DOCX_PARAS_PER_PAGE`` paragraphs. A document with no paragraphs still
    yields one (empty) page. Only ``document.paragraphs`` is read.
    """
    import docx

    paragraphs = [para.text for para in docx.Document(str(path)).paragraphs]
    # max(..., 1) guarantees at least one page for an empty document
    page_starts = range(0, max(len(paragraphs), 1), DOCX_PARAS_PER_PAGE)
    pages: list[str] = [
        "\n".join(paragraphs[begin : begin + DOCX_PARAS_PER_PAGE])
        for begin in page_starts
    ]
    return join_pages(pages)
def extract_doc(path: str | os.PathLike) -> str:
    """
    Extract a legacy .doc by converting it to .docx with headless LibreOffice.

    The conversion runs in a temporary directory with a 300-second timeout and
    the result is handed to ``extract_docx``.

    Raises:
        RuntimeError: if neither ``libreoffice`` nor ``soffice`` is on PATH, or
            if the conversion produced no output file.
        subprocess.CalledProcessError / subprocess.TimeoutExpired: if the
            converter exits non-zero or exceeds the timeout.
    """
    soffice = shutil.which("libreoffice") or shutil.which("soffice")
    if not soffice:
        raise RuntimeError("libreoffice/soffice not available — cannot convert .doc")

    src = Path(path).resolve()
    with tempfile.TemporaryDirectory() as workdir:
        command = [
            soffice, "--headless", "--convert-to", "docx",
            "--outdir", workdir, str(src),
        ]
        subprocess.run(command, check=True, capture_output=True, timeout=300)

        converted = Path(workdir, f"{src.stem}.docx")
        if converted.exists():
            # must be read before the temporary directory is removed
            return extract_docx(converted)
        raise RuntimeError(f"libreoffice produced no output for {src.name}")
def extract_pptx(path: str | os.PathLike) -> str:
    """
    Extract a presentation into a page-marked body, one page per slide.

    A slide's page holds the non-blank text of every shape that has a text
    frame, followed by the speaker notes (prefixed with ``[NOTES]``) when the
    slide has non-blank notes.
    """
    from pptx import Presentation

    deck = Presentation(str(path))
    slide_texts: list[str] = []
    for slide in deck.slides:
        fragments: list[str] = [
            shape.text_frame.text.strip()
            for shape in slide.shapes
            if shape.has_text_frame and shape.text_frame.text.strip()
        ]
        if slide.has_notes_slide:
            speaker_notes = slide.notes_slide.notes_text_frame.text.strip()
            if speaker_notes:
                fragments.append(f"[NOTES] {speaker_notes}")
        slide_texts.append("\n".join(fragments))
    return join_pages(slide_texts)
def extract_html(path: str | os.PathLike) -> str:
    """
    HTML mirror page → body. Strips nav/script/style/footer/header/aside.

    The file is decoded with the encoding chardet detects (UTF-8 when it
    detects none, or when Python has no codec for the detected name);
    undecodable bytes are replaced. Script, style and page-chrome elements are
    removed, and the remaining non-blank text lines are returned as a single
    page.
    """
    import chardet
    from bs4 import BeautifulSoup

    raw = Path(path).read_bytes()
    enc = chardet.detect(raw).get("encoding") or "utf-8"
    try:
        markup = raw.decode(enc, errors="replace")
    except LookupError:
        # chardet can report encodings Python ships no codec for (e.g. EUC-TW);
        # one oddly-encoded mirror page must not abort the run.
        markup = raw.decode("utf-8", errors="replace")
    soup = BeautifulSoup(markup, "lxml")
    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        tag.decompose()
    # also drop common chrome identified by its ARIA role
    for tag in soup.find_all(attrs={"role": ["navigation", "banner", "contentinfo"]}):
        tag.decompose()
    text = soup.get_text(separator="\n")
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    return join_pages(["\n".join(lines)])
def extract_zip(path: str | os.PathLike) -> str:
    """
    zip → body. Unzips into a temp dir and recurses on every extractable inner
    file. Inner files are page-renumbered into one continuous body.

    Returns "" when the archive itself is not a valid zip. Inner files are
    processed best-effort in sorted path order: junk, unknown and epub entries
    are skipped, and any inner file whose extraction raises is skipped too.
    Nested zips go through the same guarded path (extract_file dispatches them
    back to this function), so one damaged or encrypted inner archive no
    longer aborts the whole outer archive.
    """
    path = str(path)
    pages: list[str] = []
    with tempfile.TemporaryDirectory() as tmp:
        try:
            with zipfile.ZipFile(path) as zf:
                zf.extractall(tmp)
        except zipfile.BadZipFile:
            return ""
        for inner in sorted(Path(tmp).rglob("*")):
            if not inner.is_file() or is_junk(inner):
                continue
            if detect_format(inner) in ("unknown", "epub"):
                continue
            try:
                body = extract_file(inner)
            except Exception:
                # deliberate best-effort: one bad member must not lose the rest
                continue
            pages.extend(t for _, t in split_pages(body))
    return join_pages(pages)
# Dispatch table used by extract_file: detected format name → extractor
# callable. "txt" has no entry because extract_file handles it inline.
EXTRACTORS: dict[str, Callable[[str | os.PathLike], str]] = {
"pdf": extract_pdf,
"docx": extract_docx,
"doc": extract_doc,
"pptx": extract_pptx,
"html": extract_html,
"zip": extract_zip,
}
def extract_file(path: str | os.PathLike) -> str:
    """
    Route one file to its extractor and return a page-marked body.

    Plain-text files are read as UTF-8 (bad bytes replaced); text that already
    carries page markers is returned untouched, otherwise it is wrapped as a
    single page. Any other format is looked up in EXTRACTORS.

    Raises:
        ValueError: when no extractor is registered for the detected format.
    """
    kind = detect_format(path)
    if kind != "txt":
        handler = EXTRACTORS.get(kind)
        if handler is None:
            raise ValueError(f"No extractor for format '{kind}': {path}")
        return handler(path)
    text = Path(path).read_text(encoding="utf-8", errors="replace")
    if count_page_markers(text):
        return text
    return join_pages([text])

View File

@@ -1,424 +0,0 @@
#!/usr/bin/env python3
"""
HTML Activity Extractor - processes 1876 HTML files
Automatically extracts activities using pattern recognition
"""
import os
import re
import json
from pathlib import Path
from bs4 import BeautifulSoup
import chardet
from typing import List, Dict, Optional
import sqlite3
from datetime import datetime
class HTMLActivityExtractor:
    """Heuristic extractor that mines game/activity records from HTML files.

    Four strategies are applied to every page (lists, headings, regex title
    patterns, tables) and their results are concatenated. Rows are written to
    the ``activities`` table; a row whose (name, source_file) pair already
    exists is counted as a duplicate and skipped.
    """

    def __init__(self, db_path='data/activities.db'):
        """Store the SQLite path and build the pattern and category tables."""
        self.db_path = db_path
        # Detection patterns for Romanian-language activity text. Character
        # classes such as [tțţ], [ăa] and [âa] accept both the accented and the
        # plain-ASCII spelling (the previous classes were mis-encoded and
        # matched stray characters or only the ASCII form).
        self.activity_patterns = {
            'title_patterns': [
                r'(?i)(joc|activitate|exerci[tțţ]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
                r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[tțţ]iu)[^<]*)</h[1-6]>',
                r'(?i)<strong>([^<]*(?:joc|activitate|exerci[tțţ]iu)[^<]*)</strong>',
                r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[tțţ]iu)[^\.]{0,50})$',
            ],
            'description_markers': [
                'descriere', 'reguli', 'cum se joac[a]', 'instructiuni',
                'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
            ],
            'materials_markers': [
                'materiale', 'necesare', 'echipament', 'ce avem nevoie',
                'se folosesc', 'trebuie sa avem', 'dotari'
            ],
            'age_patterns': [
                r'(?i)v[âa]rst[ăa][\s:]+(\d+)[\s-]+(\d+)',
                r'(?i)(\d+)[\s-]+(\d+)\s*ani',
                r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
                r'(?i)categoria?\s*(?:de\s*)?v[âa]rst[ăa][\s:]+(\d+)[\s-]+(\d+)',
            ],
            'participants_patterns': [
                r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[tțţ]i|juc[ăa]tori|persoane|copii)',
                r'(?i)num[ăa]r\s*(?:de\s*)?(?:participan[tțţ]i|juc[ăa]tori)[\s:]+(\d+)[\s-]+(\d+)',
                r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
            ],
            'duration_patterns': [
                r'(?i)durat[ăa][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
                r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
                r'(?i)(\d+)[\s-]+(\d+)\s*minute',
            ]
        }
        # Predefined category codes with the keywords that select them.
        # Checked in insertion order; the first hit wins.
        self.categories = {
            '[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
            '[B]': ['aventura', 'explorare', 'descoperire'],
            '[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
            '[D]': ['foc', 'flacara', 'lumina'],
            '[E]': ['noduri', 'frânghii', 'sfori', 'legare'],
            '[F]': ['bushcraft', 'supravietuire', 'survival'],
            '[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
            '[H]': ['orientare', 'busola', 'harta', 'navigare']
        }

    def detect_encoding(self, file_path):
        """Return the encoding chardet detects for the file, or 'utf-8' if none."""
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read())
        return result['encoding'] or 'utf-8'

    def extract_from_html(self, html_path: str) -> List[Dict]:
        """Extract activities from a single HTML file.

        Any error is printed and whatever was collected so far is returned,
        so one bad page never stops a batch run.
        """
        activities = []
        try:
            # Detect the encoding, then read (undecodable bytes are dropped)
            encoding = self.detect_encoding(html_path)
            with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
                content = f.read()
            soup = BeautifulSoup(content, 'lxml')
            # Method 1: lists of activities
            activities.extend(self._extract_from_lists(soup, html_path))
            # Method 2: activities introduced by headings
            activities.extend(self._extract_from_headings(soup, html_path))
            # Method 3: regex patterns over the page text
            activities.extend(self._extract_from_patterns(soup, html_path))
            # Method 4: tables
            activities.extend(self._extract_from_tables(soup, html_path))
        except Exception as e:
            print(f"Error processing {html_path}: {e}")
        return activities

    def _extract_from_lists(self, soup, source_file):
        """Extract activities from HTML lists (ul, ol)."""
        activities = []
        for list_elem in soup.find_all(['ul', 'ol']):
            # Only consider lists whose text mentions an activity keyword
            list_text = list_elem.get_text().lower()
            if not any(marker in list_text for marker in ['joc', 'activitate', 'exercitiu']):
                continue
            for li in list_elem.find_all('li'):
                text = li.get_text(strip=True)
                if len(text) > 20:  # very short items are not activities
                    activity = self._create_activity_from_text(text, source_file)
                    if activity:
                        activities.append(activity)
        return activities

    def _extract_from_headings(self, soup, source_file):
        """Extract activities whose name is a heading containing a keyword."""
        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        activities = []
        for heading in soup.find_all(heading_tags):
            heading_text = heading.get_text(strip=True)
            if not any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
                continue
            # Collect the description from following siblings up to the next heading
            description = ""
            next_elem = heading.find_next_sibling()
            while next_elem and next_elem.name not in heading_tags:
                if next_elem.name in ['p', 'div', 'ul']:
                    description += next_elem.get_text(strip=True) + " "
                if len(description) > 500:  # cap the amount of text collected
                    break
                next_elem = next_elem.find_next_sibling()
            if description:
                activities.append({
                    'name': heading_text[:200],
                    'description': description[:1000],
                    'source_file': str(source_file),
                    'category': self._detect_category(heading_text + " " + description)
                })
        return activities

    def _extract_from_patterns(self, soup, source_file):
        """Extract activities by matching the title regexes against page text."""
        activities = []
        text = soup.get_text()
        for pattern in self.activity_patterns['title_patterns']:
            for match in re.finditer(pattern, text, re.MULTILINE):
                # lastindex is None (never 0) when no group matched, so test
                # truthiness; otherwise use the last matched group as title.
                title = match.group(match.lastindex) if match.lastindex else match.group(0)
                if len(title) > 10:
                    # Use the text surrounding the match as the description
                    start = max(0, match.start() - 200)
                    end = min(len(text), match.end() + 500)
                    context = text[start:end]
                    activity = self._create_activity_from_text(context, source_file, title)
                    if activity:
                        activities.append(activity)
        return activities

    def _extract_from_tables(self, soup, source_file):
        """Extract activities from tables, one per data row."""
        activities = []
        for table in soup.find_all('table'):
            rows = table.find_all('tr')
            if len(rows) <= 1:  # need a header row plus at least one data row
                continue
            # The first row supplies the column names
            headers = [th.get_text(strip=True).lower() for th in rows[0].find_all(['th', 'td'])]
            for row in rows[1:]:
                cells = row.find_all(['td'])
                if not cells:
                    continue
                activity_data = {}
                for i, cell in enumerate(cells):
                    if i < len(headers):
                        activity_data[headers[i]] = cell.get_text(strip=True)
                # Only rows that have a name-like column become activities
                if any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
                    activity = self._create_activity_from_table_data(activity_data, source_file)
                    if activity:
                        activities.append(activity)
        return activities

    def _create_activity_from_text(self, text, source_file, title=None):
        """Build an activity dict from free text; None if text is under 30 chars."""
        if not text or len(text) < 30:
            return None
        activity = {
            'name': title or text[:100].split('.')[0].strip(),
            'description': text[:1000],
            'source_file': str(source_file),
            'category': self._detect_category(text),
            'keywords': self._extract_keywords(text),
            'created_at': datetime.now().isoformat()
        }
        # Add age / participants / duration / materials when detectable
        activity.update(self._extract_metadata(text))
        return activity

    def _create_activity_from_table_data(self, data, source_file):
        """Build an activity dict from one table row; None without a usable name."""
        activity = {
            'source_file': str(source_file),
            'created_at': datetime.now().isoformat()
        }
        # Map table column names to DB column names (later entries overwrite
        # earlier ones that target the same DB column)
        field_mapping = {
            'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
            'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
            'materiale': 'materials_list', 'echipament': 'materials_list',
            'varsta': 'age_group_min', 'categoria': 'category',
            'participanti': 'participants_min', 'numar': 'participants_min',
            'durata': 'duration_min', 'timp': 'duration_min'
        }
        for table_field, db_field in field_mapping.items():
            if table_field in data:
                activity[db_field] = data[table_field]
        # Minimal validation: the name must be longer than 5 characters
        if 'name' in activity and len(activity.get('name', '')) > 5:
            return activity
        return None

    def _match_range(self, pattern_key, text):
        """Return (low, high) from the first matching pattern under *pattern_key*, else None."""
        for pattern in self.activity_patterns[pattern_key]:
            match = re.search(pattern, text)
            if match:
                low = int(match.group(1))
                high = int(match.group(2)) if (match.lastindex or 0) >= 2 else low
                return low, high
        return None

    def _extract_metadata(self, text):
        """Extract age, participant and duration ranges plus a materials list."""
        metadata = {}
        for prefix, pattern_key in (
            ('age_group', 'age_patterns'),
            ('participants', 'participants_patterns'),
            ('duration', 'duration_patterns'),
        ):
            found = self._match_range(pattern_key, text)
            if found:
                metadata[f'{prefix}_min'], metadata[f'{prefix}_max'] = found
        # Materials: look at the 200 characters starting at each marker
        materials = []
        text_lower = text.lower()
        for marker in self.activity_patterns['materials_markers']:
            idx = text_lower.find(marker)
            if idx != -1:
                materials_text = text[idx:idx+200]
                # The hyphen is escaped so the class means "not newline, hyphen
                # or quote"; unescaped it formed the range \n-" which also
                # excluded spaces and cut every item at its first blank.
                items = re.findall(r'[-"]\s*([^\n\-"]+)', materials_text)
                if items:
                    materials.extend(items)
        if materials:
            metadata['materials_list'] = ', '.join(materials[:10])  # at most 10 materials
        return metadata

    def _detect_category(self, text):
        """Return the first category whose keyword occurs in *text*; '[A]' by default."""
        text_lower = text.lower()
        for category, keywords in self.categories.items():
            if any(keyword in text_lower for keyword in keywords):
                return category
        return '[A]'  # default: games

    def _extract_keywords(self, text):
        """Return up to five known keywords found in *text*, comma-separated."""
        text_lower = text.lower()
        keyword_list = [
            'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
            'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
            'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
            'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura'
        ]
        keywords = [keyword for keyword in keyword_list if keyword in text_lower]
        return ', '.join(keywords[:5])  # at most 5 keywords

    def save_to_database(self, activities):
        """Insert activities into the DB; return (saved_count, duplicate_count).

        A failing row is printed and skipped. The connection is always closed,
        even if the commit or an unexpected error raises.
        """
        saved_count = 0
        duplicate_count = 0
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            for activity in activities:
                try:
                    # Skip rows already present for this source
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )
                    if cursor.fetchone():
                        duplicate_count += 1
                        continue
                    columns = []
                    values = []
                    placeholders = []
                    for key, value in activity.items():
                        if key != 'created_at':  # Skip created_at, it has default
                            columns.append(key)
                            values.append(value)
                            placeholders.append('?')
                    query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
                    cursor.execute(query, values)
                    saved_count += 1
                except Exception as e:
                    print(f"Error saving activity: {e}")
                    continue
            conn.commit()
        finally:
            conn.close()
        return saved_count, duplicate_count

    def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every *.html / *.htm file under *base_path*; return (processed, errors).

        Activities are flushed to the database every 100 files and once more
        at the end.
        """
        base_path = Path(base_path)
        html_files = list(base_path.rglob("*.html"))
        html_files.extend(base_path.rglob("*.htm"))
        print(f"Found {len(html_files)} HTML files to process")
        all_activities = []
        processed = 0
        errors = 0
        for i, html_file in enumerate(html_files):
            try:
                activities = self.extract_from_html(str(html_file))
                all_activities.extend(activities)
                processed += 1
                # Progress update
                if (i + 1) % 100 == 0:
                    print(f"Progress: {i+1}/{len(html_files)} files processed, {len(all_activities)} activities found")
                    # Save batch to DB
                    if all_activities:
                        saved, dupes = self.save_to_database(all_activities)
                        print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
                        all_activities = []  # Clear buffer
            except Exception as e:
                print(f"Error processing {html_file}: {e}")
                errors += 1
        # Save remaining activities
        if all_activities:
            saved, dupes = self.save_to_database(all_activities)
            print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")
        print("\nProcessing complete!")
        print(f"Files processed: {processed}")
        print(f"Errors: {errors}")
        return processed, errors
# Manual entry point: smoke-test a few files, then optionally run everything
if __name__ == "__main__":
    extractor = HTMLActivityExtractor()
    # Try a handful of sample files first
    print("Testing on sample file first...")
    corpus_dir = Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri')
    for test_file in list(corpus_dir.rglob("*.html"))[:3]:
        print(f"\nTesting: {test_file}")
        found = extractor.extract_from_html(str(test_file))
        print(f"Found {len(found)} activities")
        if found:
            print(f"Sample activity: {found[0]['name'][:50]}...")
    # Ask before launching the full run
    if input("\nContinue with full processing? (y/n): ").lower() == 'y':
        extractor.process_all_html_files()

View File

@@ -1,78 +0,0 @@
#!/usr/bin/env python3
"""
Import activities extracted by Claude from JSON files
"""
import json
import sqlite3
from pathlib import Path
from datetime import datetime
class ClaudeActivityImporter:
    """Import activities that Claude extracted into JSON files into SQLite.

    Each JSON file is an object with an optional ``source_file`` string and an
    ``activities`` list of flat dicts whose keys are ``activities`` column
    names.
    """

    def __init__(self, db_path='data/activities.db'):
        """Remember the DB path and make sure the JSON input directory exists."""
        self.db_path = db_path
        self.json_dir = Path('scripts/extracted_activities')
        # parents=True so a missing "scripts/" directory does not raise
        self.json_dir.mkdir(parents=True, exist_ok=True)

    def import_json_file(self, json_path):
        """Import activities from a single JSON file; return how many were inserted.

        An activity whose (name, source_file) pair already exists is skipped.
        A failing activity is printed and skipped. The connection is always
        closed, even when an unexpected error propagates.
        """
        json_path = Path(json_path)
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        source_file = data.get('source_file', str(json_path))
        activities = data.get('activities', [])
        imported = 0
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            for activity in activities:
                try:
                    # Add source file and timestamp
                    activity['source_file'] = source_file
                    activity['created_at'] = datetime.now().isoformat()
                    columns = list(activity.keys())
                    # Column names cannot be bound as SQL parameters and are
                    # spliced into the statement, so accept plain ASCII
                    # identifiers only; anything else could inject SQL.
                    invalid = [c for c in columns if not (c.isascii() and c.isidentifier())]
                    if invalid:
                        raise ValueError(f"invalid column name(s): {invalid}")
                    values = list(activity.values())
                    placeholders = ['?' for _ in values]
                    # Check for duplicate
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), source_file)
                    )
                    if not cursor.fetchone():
                        quoted = ', '.join(f'"{c}"' for c in columns)
                        query = f"INSERT INTO activities ({quoted}) VALUES ({', '.join(placeholders)})"
                        cursor.execute(query, values)
                        imported += 1
                except Exception as e:
                    print(f"Error importing activity: {e}")
            conn.commit()
        finally:
            conn.close()
        print(f"Imported {imported} activities from {json_path.name}")
        return imported

    def import_all_json_files(self):
        """Import every *.json in the input directory; return the total inserted."""
        json_files = list(self.json_dir.glob("*.json"))
        if not json_files:
            print("No JSON files found in extracted_activities directory")
            return 0
        total_imported = sum(self.import_json_file(json_file) for json_file in json_files)
        print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files")
        return total_imported
# Script entry point: import every JSON file from the default directory
if __name__ == "__main__":
    ClaudeActivityImporter().import_all_json_files()

179
scripts/import_common.py Normal file
View File

@@ -0,0 +1,179 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
import_common.py — shared helpers for the import / validation side of the
extraction pipeline (Lane C).
Used by build_database.py and validate_extractions.py:
* JSON-schema validation of subagent extraction files,
* the anti-hallucination source_excerpt substring check (E5),
* locating the source chunk that an extraction file came from,
* the stable content key used by the needs_review queue.
"""
from __future__ import annotations
import hashlib
import json
import re
import unicodedata
from pathlib import Path
from typing import Any, Optional
# Directory that contains this script, and its parent directory.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Default schema file, looked up next to this script.
DEFAULT_SCHEMA_PATH = SCRIPT_DIR / "activity_schema.json"
# rapidfuzz.partial_ratio is on a 0..100 scale; an excerpt counts as a real
# quote from the source when it scores at least this against the chunk text.
EXCERPT_MATCH_THRESHOLD = 90.0
# --------------------------------------------------------------------------
# schema validation
# --------------------------------------------------------------------------
def load_schema(schema_path: str | Path = DEFAULT_SCHEMA_PATH) -> dict:
    """Read the activity JSON schema (UTF-8) from *schema_path* and return it parsed."""
    with Path(schema_path).open(encoding="utf-8") as handle:
        return json.load(handle)
def validate_extraction(data: Any, schema: dict) -> list[str]:
    """
    Check one parsed extraction file against `schema` (JSON Schema draft 7).

    Returns one "<location>: <message>" string per violation, ordered by the
    violation's path inside the document; "<root>" stands for the top level.
    An empty list means the data is valid.
    """
    import jsonschema

    problems = list(jsonschema.Draft7Validator(schema).iter_errors(data))
    problems.sort(key=lambda problem: list(problem.path))
    return [
        f"{'/'.join(map(str, problem.path)) or '<root>'}: {problem.message}"
        for problem in problems
    ]
# --------------------------------------------------------------------------
# excerpt verification (E5 — anti-hallucination)
# --------------------------------------------------------------------------
def _normalize_text(text: str) -> str:
    """Collapse each whitespace run to one space, trim, and lowercase; falsy input gives ""."""
    collapsed = re.sub(r"\s+", " ", text if text else "")
    return collapsed.strip().lower()
def excerpt_score(excerpt: str, chunk_text: str) -> float:
    """
    Score (0..100) how well `excerpt` occurs as a fuzzy substring of `chunk_text`.

    Both texts are whitespace-collapsed and lowercased before comparison.
    Returns 0.0 when either argument is empty.
    """
    from rapidfuzz import fuzz

    if excerpt and chunk_text:
        needle = _normalize_text(excerpt)
        haystack = _normalize_text(chunk_text)
        return float(fuzz.partial_ratio(needle, haystack))
    return 0.0
def excerpt_matches(
    excerpt: str, chunk_text: str, threshold: float = EXCERPT_MATCH_THRESHOLD
) -> bool:
    """Report whether the fuzzy-substring score of `excerpt` in `chunk_text` reaches `threshold`."""
    score = excerpt_score(excerpt, chunk_text)
    return threshold <= score
# --------------------------------------------------------------------------
# locating the source chunk an extraction file came from
# --------------------------------------------------------------------------
def chunk_key_for(json_path: Path, header: Optional[dict]) -> str:
    """
    Work out which chunk an extraction file belongs to.

    A non-empty `chunk_key` in the header wins (returned as a string);
    without one, the stem of the JSON file name is used, since extraction
    files are named `<chunk_key>.json`.
    """
    explicit = header.get("chunk_key") if header else None
    return str(explicit) if explicit else json_path.stem
def source_id_for(chunk_key: str, header: Optional[dict]) -> str:
    """
    Work out the source id for a chunk.

    A non-empty `source_id` in the header wins. Otherwise everything before
    the last ".part" in the chunk key is used (`<source_id>.partNN` →
    `<source_id>`); a key without ".part" is returned unchanged.
    """
    declared = (header or {}).get("source_id")
    if declared:
        return str(declared)
    base, marker, _ = chunk_key.rpartition(".part")
    return base if marker else chunk_key
def find_chunk_text(
    json_path: Path, header: Optional[dict], chunks_dir: Path
) -> Optional[str]:
    """
    Return the text of the source chunk for an extraction file, or None.

    Looks for <chunks_dir>/<source_id>/<chunk_key>.txt, then falls back to a
    recursive search for <chunk_key>.txt anywhere under `chunks_dir`.

    The chunk key and source id may come from the extraction file's header,
    which is subagent output. A key that is empty, is "." / "..", or contains
    a path separator is rejected (None) so it can never address a file
    outside `chunks_dir`. The fallback escapes glob metacharacters and picks
    the first match in sorted order, so the result is deterministic.
    """
    import glob

    chunk_key = chunk_key_for(json_path, header)
    source_id = source_id_for(chunk_key, header)
    for component in (chunk_key, source_id):
        if (
            not component
            or component in (".", "..")
            or "/" in component
            or "\\" in component
        ):
            return None
    candidate = chunks_dir / source_id / f"{chunk_key}.txt"
    if candidate.is_file():
        return candidate.read_text(encoding="utf-8", errors="replace")
    matches = sorted(chunks_dir.rglob(glob.escape(f"{chunk_key}.txt")))
    if matches:
        return matches[0].read_text(encoding="utf-8", errors="replace")
    return None
def source_path_for(source_id: str, sources_dir: Path) -> Optional[str]:
    """
    Look up the original path recorded in a normalized source file.

    Scans the header of <sources_dir>/<source_id>.txt for a line starting with
    `SOURCE:` and returns the text after the first colon, stripped. Scanning
    stops at the header rule (a line starting with "=") or the first page
    marker. Returns None if the file is missing, unreadable, or has no
    `SOURCE:` line in its header.
    """
    header_file = sources_dir / f"{source_id}.txt"
    if not header_file.is_file():
        return None
    try:
        with header_file.open(encoding="utf-8", errors="replace") as stream:
            for raw in stream:
                if raw.startswith("SOURCE:"):
                    _, _, value = raw.partition(":")
                    return value.strip()
                if raw.startswith(("=", "--- PAGE ")):
                    return None
    except OSError:
        pass
    return None
# --------------------------------------------------------------------------
# stable content key for the needs_review queue (plan §5c)
# --------------------------------------------------------------------------
def normalize_name(name: str) -> str:
    """
    Build the dedup form of a name.

    The name is NFKD-decomposed, combining marks are dropped (removing
    diacritics), and the result is lowercased, trimmed, and has whitespace
    runs collapsed to single spaces. A falsy name gives "".
    """
    if not name:
        return ""
    base_chars = [
        ch for ch in unicodedata.normalize("NFKD", name)
        if unicodedata.combining(ch) == 0
    ]
    lowered = "".join(base_chars).lower().strip()
    return re.sub(r"\s+", " ", lowered)
def content_key(normalized_name: str, language: Optional[str], description: str) -> str:
    """
    Hash that identifies a row in the review queue.

    The SHA-1 hex digest of the normalized name, the language ("" when
    missing) and the whitespace/case-normalized description, joined by the
    unit-separator character. Rows carrying needs_review are never
    auto-merged, so this triple stays the same across rebuilds.
    """
    lang = language if language else ""
    body = _normalize_text(description)
    digest = hashlib.sha1()
    digest.update(f"{normalized_name}\x1f{lang}\x1f{body}".encode("utf-8"))
    return digest.hexdigest()
# --------------------------------------------------------------------------
# iteration
# --------------------------------------------------------------------------
def iter_extraction_files(extracted_dir: Path):
    """
    Yield, in sorted order, each regular *.json file directly inside `extracted_dir`.

    Sub-directories such as _rejected/ are not searched. Yields nothing when
    `extracted_dir` is not a directory.
    """
    if extracted_dir.is_dir():
        yield from (
            candidate
            for candidate in sorted(extracted_dir.glob("*.json"))
            if candidate.is_file()
        )

View File

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
Output files keep the existing header format:
SOURCE: <original relative path>
CONVERTED: <iso date>
FORMAT: <pdf|docx|doc|pptx|html-mirror|zip>
NEEDS_REVIEW: <reason> (optional — legacy .doc conversions)
==================================================
--- PAGE 1 ---
...
Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
so two files with the same name in different folders never collide.
The pipeline is script-only: this normalizes formats, it does NOT run extraction.
Run `--check-deps` before a long job.
"""
from __future__ import annotations
import argparse
import datetime as _dt
import hashlib
import re
import sys
from pathlib import Path
SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
sys.path.insert(0, str(SCRIPT_DIR))
from extract_common import ( # noqa: E402
count_page_markers,
dedupe_texts,
detect_format,
extract_file,
extract_html,
is_junk,
join_pages,
preflight,
split_pages,
)
# Rule of 50 "=" characters that build_header writes as the last header line.
HEADER_RULE = "=" * 50
# --------------------------------------------------------------------------
# stable source id
# --------------------------------------------------------------------------
def sanitize_stem(stem: str) -> str:
    """
    Turn a file stem into a lowercase, id-safe fragment.

    Each run of non-word characters becomes one underscore, leading and
    trailing underscores are removed, and the result is cut to 60 characters.
    If nothing is left, "source" is returned.
    """
    cleaned = re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE)
    cleaned = cleaned.strip("_").lower()[:60]
    return cleaned if cleaned else "source"
def stable_id(relative_path: str | Path) -> str:
    """
    Derive a collision-proof source id from a corpus-relative path.

    The id is the first 8 hex digits of the SHA-1 of the path (backslashes
    turned into forward slashes), an underscore, and the sanitized file stem.
    """
    posix_rel = str(relative_path).replace("\\", "/")
    prefix = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()[:8]
    return "_".join((prefix, sanitize_stem(Path(posix_rel).stem)))
# --------------------------------------------------------------------------
# header
# --------------------------------------------------------------------------
def build_header(
    source_rel: str, fmt: str, needs_review: str | None = None
) -> str:
    """
    Render the metadata header that starts every normalized source file.

    Lines are SOURCE, CONVERTED (today's ISO date), FORMAT, an optional
    NEEDS_REVIEW line when a reason is given, then the header rule. The
    result ends with a blank line.
    """
    stamp = _dt.date.today().isoformat()
    fields = [("SOURCE", source_rel), ("CONVERTED", stamp), ("FORMAT", fmt)]
    if needs_review:
        fields.append(("NEEDS_REVIEW", needs_review))
    rows = [f"{label}: {value}" for label, value in fields]
    rows.append(HEADER_RULE)
    return "\n".join(rows) + "\n\n"
# --------------------------------------------------------------------------
# mirror-site directories
# --------------------------------------------------------------------------
# Lowercase file suffixes that count as HTML pages of a site mirror.
MIRROR_PAGE_EXTS = {".html", ".htm"}
def is_mirror_dir(path: Path) -> bool:
    """
    Decide whether `path` is a site-mirror directory.

    True when it is a directory, its name does not end in "_files", and at
    least one file anywhere below it has an HTML page suffix.
    """
    if not path.is_dir() or path.name.endswith("_files"):
        return False
    for candidate in path.rglob("*"):
        if candidate.is_file() and candidate.suffix.lower() in MIRROR_PAGE_EXTS:
            return True
    return False
def normalize_mirror(mirror_dir: Path) -> str:
    """
    Extract every HTML page in a mirror dir, dedupe near-duplicates, join.

    Pages are visited in sorted path order. Pages that live inside a
    "<name>_files" asset folder (at any depth below `mirror_dir`) are skipped;
    the check uses the same `endswith("_files")` rule as is_mirror_dir, so it
    also catches folders such as "Page_files" and not only one literally
    named "_files". A page whose extraction raises, or that has no text, is
    skipped as well.
    """
    pages: list[tuple[str, str]] = []
    for html in sorted(mirror_dir.rglob("*")):
        if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
            continue
        rel = html.relative_to(mirror_dir)
        # Only directories below the mirror root are considered, so the
        # location of the mirror itself cannot exclude every page.
        if any(part.endswith("_files") for part in rel.parent.parts):
            continue
        try:
            body = extract_html(html)
        except Exception:
            # deliberate best-effort: one unreadable page must not lose the mirror
            continue
        text = "\n".join(t for _, t in split_pages(body))
        if text.strip():
            pages.append((str(rel), text))
    pages = dedupe_texts(pages)
    return join_pages([t for _, t in pages])
# --------------------------------------------------------------------------
# one source
# --------------------------------------------------------------------------
def normalize_one(
    path: Path, corpus_root: Path, out_dir: Path
) -> dict | None:
    """
    Convert one file or mirror directory into <out_dir>/<id>.txt.

    Returns None when the entry is ignored (a non-mirror directory, a junk
    file, or an unknown / epub / txt file). Otherwise returns a result dict
    with "id", "source" and "status": "error" (with "error") when extraction
    raised, "empty" when no text came out, or "ok" with "format", "pages" and
    "needs_review". Legacy .doc files are always flagged for review.
    """
    rel = path.relative_to(corpus_root)
    sid = stable_id(rel)
    rel_text = str(rel)
    review_note = None

    if path.is_dir():
        if not is_mirror_dir(path):
            return None
        fmt = "html-mirror"
        body = normalize_mirror(path)
    else:
        if is_junk(path):
            return None
        fmt = detect_format(path)
        if fmt in ("unknown", "epub", "txt"):
            # epub duplicates PDFs; txt is not a source format here
            return None
        if fmt == "doc":
            review_note = "legacy .doc conversion is imperfect"
        try:
            body = extract_file(path)
        except Exception as exc:  # noqa: BLE001
            return {"id": sid, "source": rel_text, "status": "error", "error": str(exc)}

    if not body.strip():
        return {"id": sid, "source": rel_text, "status": "empty"}

    target = out_dir / f"{sid}.txt"
    target.write_text(build_header(rel_text, fmt, review_note) + body,
                      encoding="utf-8")
    return {
        "id": sid,
        "source": rel_text,
        "status": "ok",
        "format": fmt,
        "pages": count_page_markers(body),
        "needs_review": bool(review_note),
    }
# --------------------------------------------------------------------------
# walk
# --------------------------------------------------------------------------
def iter_corpus_entries(corpus_root: Path):
    """
    Yield every source entry under the corpus root, in sorted order.

    Files are yielded as they are. A directory that is a site mirror is
    yielded as one entry and not descended into. Any other directory is
    walked recursively, so documents kept in sub-folders are no longer
    silently dropped (source ids are derived from the relative path, so
    equal file names in different folders stay distinct). Hidden entries
    (names starting with ".") and "<name>_files" asset folders are skipped.

    NOTE(review): a folder that holds documents plus at least one HTML page
    anywhere below it still counts as a mirror, so only its HTML is used —
    confirm that this is acceptable for the corpus layout.
    """
    for entry in sorted(corpus_root.iterdir()):
        if entry.name.startswith("."):
            continue
        if not entry.is_dir():
            yield entry
        elif entry.name.endswith("_files"):
            continue
        elif is_mirror_dir(entry):
            yield entry
        else:
            yield from iter_corpus_entries(entry)
def run(corpus_root: Path, out_dir: Path) -> dict:
    """
    Normalize every corpus entry and summarize the outcome.

    Creates `out_dir` if needed. The summary holds the counts "total", "ok",
    "errors", "empty" and "needs_review", plus the per-entry "results" list
    (ignored entries are not included).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    outcomes = (
        normalize_one(entry, corpus_root, out_dir)
        for entry in iter_corpus_entries(corpus_root)
    )
    results: list[dict] = [res for res in outcomes if res is not None]

    def with_status(status: str) -> int:
        return sum(1 for r in results if r["status"] == status)

    return {
        "total": len(results),
        "ok": with_status("ok"),
        "errors": with_status("error"),
        "empty": with_status("empty"),
        "needs_review": sum(1 for r in results if r.get("needs_review")),
        "results": results,
    }
# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def print_preflight(report: dict) -> int:
    """
    Print a dependency preflight report and return a process exit code.

    Reads "missing_python", "missing_system", "warnings" and "ok" from
    `report`. Returns 0 when the report is ok, otherwise 1.
    """
    print("Dependency preflight")
    print("--------------------")
    missing_py = report["missing_python"]
    print(
        " MISSING Python packages: " + ", ".join(missing_py)
        if missing_py
        else " Python packages: OK"
    )
    missing_sys = report["missing_system"]
    if missing_sys:
        print(" MISSING system tools : " + ", ".join(missing_sys))
    for warning in report["warnings"]:
        print(f" WARNING: {warning}")
    if report["ok"]:
        verdict, code = "READY", 0
    else:
        verdict, code = "NOT READY — install the above", 1
    print(" => " + verdict)
    return code
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point; returns the process exit code.

    With --check-deps only the dependency preflight runs (0 when ready, 1
    otherwise). A normal run returns 1 when required Python packages are
    missing, 2 when the corpus root is not an existing directory, and 0 after
    the corpus has been walked. Per-source errors are listed in the output
    but do not change the exit code.
    """
    parser = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
    parser.add_argument("--corpus", default="data/carti-camp-jocuri",
                        help="corpus root to walk")
    parser.add_argument("--out", default="data/sources", help="output directory")
    parser.add_argument("--check-deps", action="store_true",
                        help="run dependency preflight and exit")
    parser.add_argument("--ocr", action="store_true",
                        help="include OCR (tesseract) in the preflight check")
    args = parser.parse_args(argv)
    if args.check_deps:
        return print_preflight(preflight(check_ocr=args.ocr))
    report = preflight(check_ocr=args.ocr)
    if report["missing_python"]:
        print_preflight(report)
        return 1
    for w in report["warnings"]:
        print(f"WARNING: {w}")
    corpus_root = Path(args.corpus)
    if not corpus_root.is_dir():
        # Fail with a clear message before run() creates the output directory
        # and then dies with a traceback while listing the corpus.
        print(f"ERROR: corpus root is not a directory: {corpus_root}", file=sys.stderr)
        return 2
    summary = run(corpus_root, Path(args.out))
    print(f"normalized : {summary['ok']}/{summary['total']}")
    print(f"errors : {summary['errors']}")
    print(f"empty : {summary['empty']}")
    print(f"needs_review: {summary['needs_review']}")
    for r in summary["results"]:
        if r["status"] != "ok":
            print(f" [{r['status']}] {r['source']}")
    return 0
# Script entry point: exit with the code main() returns
if __name__ == "__main__":
    sys.exit(main())

View File

@@ -1,143 +0,0 @@
#!/usr/bin/env python3
"""
PDF Mass Conversion to Text for Activity Extraction
Handles all PDF sizes efficiently with multiple fallback methods
"""
import os
import json
from pathlib import Path
import PyPDF2
import pdfplumber
from typing import List, Dict
import logging
class PDFConverter:
def __init__(self, max_pages=50):
self.max_pages = max_pages
self.conversion_stats = {}
def convert_pdf_to_text(self, pdf_path: str) -> str:
"""Convert PDF to text using multiple methods with fallbacks"""
try:
# Method 1: pdfplumber (best for tables and layout)
return self._convert_with_pdfplumber(pdf_path)
except Exception as e:
print(f"pdfplumber failed for {pdf_path}: {e}")
try:
# Method 2: PyPDF2 (fallback)
return self._convert_with_pypdf2(pdf_path)
except Exception as e2:
print(f"PyPDF2 also failed for {pdf_path}: {e2}")
return ""
def _convert_with_pdfplumber(self, pdf_path: str) -> str:
"""Primary conversion method using pdfplumber"""
text_content = ""
with pdfplumber.open(pdf_path) as pdf:
total_pages = len(pdf.pages)
pages_to_process = min(total_pages, self.max_pages)
print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")
for i, page in enumerate(pdf.pages[:pages_to_process]):
try:
page_text = page.extract_text()
if page_text:
text_content += f"\n--- PAGE {i+1} ---\n"
text_content += page_text
text_content += "\n"
except Exception as e:
print(f" Error on page {i+1}: {e}")
continue
self.conversion_stats[pdf_path] = {
'method': 'pdfplumber',
'pages_processed': pages_to_process,
'total_pages': total_pages,
'success': True,
'text_length': len(text_content)
}
return text_content
def _convert_with_pypdf2(self, pdf_path: str) -> str:
"""Fallback conversion method using PyPDF2"""
text_content = ""
with open(pdf_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
total_pages = len(reader.pages)
pages_to_process = min(total_pages, self.max_pages)
print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")
for i in range(pages_to_process):
try:
page = reader.pages[i]
page_text = page.extract_text()
if page_text:
text_content += f"\n--- PAGE {i+1} ---\n"
text_content += page_text
text_content += "\n"
except Exception as e:
print(f" Error on page {i+1}: {e}")
continue
self.conversion_stats[pdf_path] = {
'method': 'PyPDF2',
'pages_processed': pages_to_process,
'total_pages': total_pages,
'success': True,
'text_length': len(text_content)
}
return text_content
def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
    """Convert every PDF under ``pdf_directory`` into a ``.txt`` file.

    PDFs are discovered recursively. Each non-empty conversion is written to
    ``<output_directory>/<pdf stem>.txt`` with a small header (source path
    and conversion date). Afterwards ``self.conversion_stats`` is dumped to
    ``conversion_stats.json`` in the output directory.

    Returns:
        The number of entries in ``self.conversion_stats`` flagged as
        successful.
    """
    # Local import: the header must carry the real conversion date rather
    # than a date frozen into the source code.
    from datetime import date

    pdf_files = list(Path(pdf_directory).glob("**/*.pdf"))
    print(f"🔄 Converting {len(pdf_files)} PDF files to text...")

    output_root = Path(output_directory)
    os.makedirs(output_root, exist_ok=True)
    converted_on = date.today().isoformat()

    for position, pdf_path in enumerate(pdf_files, start=1):
        print(f"\n[{position}/{len(pdf_files)}] Processing {pdf_path.name}...")

        text_content = self.convert_pdf_to_text(str(pdf_path))
        if not text_content.strip():
            print(f" ❌ No text extracted from {pdf_path.name}")
            continue

        # NOTE(review): the output name uses only the stem, so two PDFs with
        # the same file name in different sub-folders overwrite each other.
        output_file = output_root / f"{pdf_path.stem}.txt"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"SOURCE: {pdf_path}\n")
            f.write(f"CONVERTED: {converted_on}\n")
            f.write("="*50 + "\n\n")
            f.write(text_content)
        print(f" ✅ Saved: {output_file}")

    # Persist per-file statistics next to the converted text files.
    stats_file = output_root / "conversion_stats.json"
    with open(stats_file, 'w', encoding='utf-8') as f:
        json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 PDF conversion complete! Check {output_directory}")
    return sum(1 for entry in self.conversion_stats.values() if entry['success'])
# Usage
if __name__ == "__main__":
    # Convert every PDF of the library folder, reading at most 50 pages each.
    corpus_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
    target_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"

    pdf_converter = PDFConverter(max_pages=50)
    total_converted = pdf_converter.convert_all_pdfs(corpus_dir, target_dir)
    print(f"Final result: {total_converted} PDFs successfully converted")

145
scripts/review_queue.py Normal file
View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
review_queue.py — CLI for the needs_review lifecycle (plan §5c).
Rows land in the queue when dedup leaves a borderline pair separate, or when a
legacy `.doc` source was converted imperfectly. Each row has a stable content
key; a decision written here is stored in data/review_decisions.json (git
tracked) and re-applied by build_database.py on every rebuild, so the queue
never resurfaces a resolved row.
Commands:
python scripts/review_queue.py list
python scripts/review_queue.py resolve <id> <merge|keep-separate|drop>
"""
from __future__ import annotations
import argparse
import json
import sqlite3
import sys
from pathlib import Path
from typing import Optional
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
if _p not in sys.path:
sys.path.insert(0, _p)
from import_common import content_key, normalize_name # noqa: E402
VALID_DECISIONS = ("merge", "keep-separate", "drop")
# --------------------------------------------------------------------------
# review_decisions.json
# --------------------------------------------------------------------------
def load_decisions(path: Path) -> dict:
    """Load the recorded review decisions from ``path``.

    The loader is best-effort: it returns an empty dict when the file is
    missing, cannot be read, is not valid UTF-8, is not valid JSON, or when
    its top-level value is not a JSON object.
    """
    if not path.is_file():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError *and* UnicodeDecodeError,
        # so a file with stray non-UTF-8 bytes is handled like bad JSON.
        return {}
    return data if isinstance(data, dict) else {}
def save_decisions(decisions: dict, path: Path) -> None:
    """Write ``decisions`` to ``path`` as key-sorted, indented UTF-8 JSON.

    Missing parent directories are created first.
    """
    path.parent.mkdir(exist_ok=True, parents=True)
    serialized = json.dumps(
        decisions, sort_keys=True, ensure_ascii=False, indent=2
    )
    path.write_text(serialized, encoding="utf-8")
# --------------------------------------------------------------------------
# queue
# --------------------------------------------------------------------------
def list_queue(db_path: Path) -> list[dict]:
    """Collect every ``needs_review`` row of the database at ``db_path``.

    Each returned dict holds the row's ``name``, ``language`` and
    ``description`` (``""`` when NULL) plus an ``id`` computed with
    ``content_key``. An empty list is returned when the database file does
    not exist or the query raises ``sqlite3.OperationalError``.
    """
    if not db_path.is_file():
        return []

    query = (
        "SELECT name, normalized_name, language, description "
        "FROM activities WHERE needs_review = 1 ORDER BY normalized_name"
    )
    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    try:
        flagged = connection.execute(query).fetchall()
    except sqlite3.OperationalError:
        return []
    finally:
        connection.close()

    queue = []
    for record in flagged:
        # Fall back to normalizing the display name when the stored
        # normalized_name is empty.
        normalized = record["normalized_name"] or normalize_name(record["name"])
        identifier = content_key(
            normalized, record["language"], record["description"] or ""
        )
        queue.append({
            "id": identifier,
            "name": record["name"],
            "language": record["language"],
            "description": record["description"] or "",
        })
    return queue
def resolve(decisions_path: Path, content_id: str, decision: str) -> dict:
    """Store ``decision`` for ``content_id`` in the decisions file.

    The file is loaded, the entry for ``content_id`` is set to
    ``{"decision": decision}`` (replacing any previous entry), and the
    result is saved back.

    Returns:
        The full decisions mapping as written.

    Raises:
        ValueError: ``decision`` is not one of ``VALID_DECISIONS``.
    """
    if decision in VALID_DECISIONS:
        recorded = load_decisions(decisions_path)
        recorded[content_id] = {"decision": decision}
        save_decisions(recorded, decisions_path)
        return recorded

    raise ValueError(
        f"invalid decision {decision!r}; expected one of {VALID_DECISIONS}"
    )
# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point for the review queue.

    ``list`` prints every row flagged ``needs_review`` together with the
    command that resolves it; ``resolve <id> <decision>`` records a decision
    in the decisions file.

    Returns:
        0 after a handled command, 1 when the command is not recognised.
    """
    cli = argparse.ArgumentParser(description="needs_review queue CLI")
    cli.add_argument("--db", default="data/activities.db")
    cli.add_argument("--decisions", default="data/review_decisions.json")

    commands = cli.add_subparsers(dest="command", required=True)
    commands.add_parser("list", help="list rows currently flagged needs_review")
    resolve_cmd = commands.add_parser("resolve", help="record a decision for a row")
    resolve_cmd.add_argument("id", help="content id from `list`")
    resolve_cmd.add_argument("decision", choices=VALID_DECISIONS)

    args = cli.parse_args(argv)

    if args.command == "resolve":
        resolve(Path(args.decisions), args.id, args.decision)
        print(f"recorded: {args.id} -> {args.decision}")
        print(f"written to {args.decisions} (applied on next build_database --rebuild)")
        return 0

    if args.command != "list":
        return 1

    queue = list_queue(Path(args.db))
    if not queue:
        print("review queue is empty.")
        return 0

    print(f"{len(queue)} row(s) need review:\n")
    for entry in queue:
        # One-line preview: first 80 characters, newlines flattened.
        preview = entry["description"][:80].replace("\n", " ")
        print(f" id : {entry['id']}")
        print(f" name : {entry['name']} [{entry['language']}]")
        print(f" desc : {preview}")
        print(f" -> review_queue.py resolve {entry['id']} <merge|keep-separate|drop>")
        print()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -1,50 +1,140 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Main extraction orchestrator
Ruleaza intregul proces de extractie
run_extraction.py — extraction orchestrator (plan §3).
The pipeline is script-only up to the LLM step: this script normalizes the
corpus, chunks the normalized sources, and emits one subagent prompt per
`pending` chunk. It does NOT run the extraction itself — that step is the
interactive Claude Code orchestrator launching waves of subagents.
Steps:
1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt
2. chunk data/sources/*.txt -> data/chunks/<id>/*.txt + manifest.json
3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md
4. report how many chunks remain `pending`
Usage:
python scripts/run_extraction.py
python scripts/run_extraction.py --skip-normalize # re-chunk only
"""
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
from typing import Optional
from unified_processor import UnifiedProcessor
from import_claude_activities import ClaudeActivityImporter
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
if _p not in sys.path:
sys.path.insert(0, _p)
def main():
print("="*60)
print("ACTIVITY EXTRACTION SYSTEM")
print("Strategy S8: Hybrid Claude + Scripts")
print("="*60)
import chunk_sources # noqa: E402
import normalize_sources # noqa: E402
# Step 1: Run automated extraction
print("\nSTEP 1: Automated Extraction")
print("-"*40)
processor = UnifiedProcessor()
processor.process_automated_formats()
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
# Step 2: Wait for Claude processing
print("\n" + "="*60)
print("STEP 2: Manual Claude Processing Required")
print("-"*40)
print("Please process PDF/DOC files with Claude using the template.")
print("Files are listed in: pdf_doc_for_claude.txt")
print("Save extracted activities as JSON in: scripts/extracted_activities/")
print("="*60)
response = input("\nHave you completed Claude processing? (y/n): ")
def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
    """Write the subagent prompt file for one chunk and return its path.

    The prompt names the chunk file to read, the chunk range, the rules
    document, the JSON file to produce and the header fields to set. Values
    missing from ``meta`` fall back to placeholders derived from
    ``chunk_key``. The file is written to
    ``<prompts_dir>/<chunk_key>.prompt.md``; the directory is created when
    needed.
    """
    chunk_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")
    expected_json = meta.get("expected_json", f"{chunk_key}.json")
    chunk_range = meta.get('chunk_range', '?')
    source_id = meta.get("source_id", "")
    source_hash = meta.get("source_hash", "")
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT)

    header_line = (
        "Header fields to set: "
        f'source_id="{source_id}", chunk_key="{chunk_key}", '
        f'source_hash="{source_hash}".'
    )
    sections = [
        f"# EXTRACTION — chunk `{chunk_key}`",
        "",
        f"Read ONLY this chunk: `{chunk_file}`",
        f"Chunk range: {chunk_range}",
        "",
        f"Follow the rules in `{rules_path}`.",
        "Identify every distinct activity, fill the schema "
        "(`scripts/activity_schema.json`), and write the result to:",
        "",
        f" data/extracted/{expected_json}",
        "",
        header_line,
        "",  # trailing empty item => the prompt ends with a newline
    ]
    body = "\n".join(sections)

    prompts_dir.mkdir(parents=True, exist_ok=True)
    destination = prompts_dir / f"{chunk_key}.prompt.md"
    destination.write_text(body, encoding="utf-8")
    return destination
if response.lower() == 'y':
# Step 3: Import Claude-extracted activities
print("\nSTEP 3: Importing Claude-extracted activities")
print("-"*40)
importer = ClaudeActivityImporter()
importer.import_all_json_files()
print("\n" + "="*60)
print("EXTRACTION COMPLETE!")
print("="*60)
def run(
    *,
    corpus_root: Path,
    sources_dir: Path,
    chunks_dir: Path,
    skip_normalize: bool = False,
) -> dict:
    """Normalize (optionally), chunk, and emit prompts for pending chunks.

    Returns a summary dict with the keys ``normalized`` (only when
    normalization ran), ``chunks`` (the chunker's own summary), ``states``
    (number of manifest chunks per state), ``pending`` (number of pending
    chunks) and ``prompts_dir``.
    """
    summary: dict = {}

    if not skip_normalize:
        normalized = normalize_sources.run(corpus_root, sources_dir)
        summary["normalized"] = {
            field: normalized[field] for field in ("ok", "total", "errors")
        }

    summary["chunks"] = chunk_sources.run(sources_dir, chunks_dir)

    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
    chunk_index = manifest["chunks"]
    prompts_dir = chunks_dir / "_prompts"

    # One prompt per pending chunk, emitted in chunk-key order.
    pending_keys = sorted(
        key for key, meta in chunk_index.items() if meta.get("state") == "pending"
    )
    for key in pending_keys:
        emit_chunk_prompt(key, chunk_index[key], prompts_dir)

    # Tally chunks per state; entries without a state are counted under "?".
    state_counts: dict[str, int] = {}
    for meta in chunk_index.values():
        label = meta.get("state", "?")
        state_counts[label] = state_counts.get(label, 0) + 1

    summary["states"] = state_counts
    summary["pending"] = len(pending_keys)
    summary["prompts_dir"] = str(prompts_dir)
    return summary
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point of the extraction orchestrator.

    Parses the corpus / sources / chunks locations, runs the pipeline via
    ``run`` and prints a summary: normalization result (unless skipped),
    chunk count, chunks per state, and what to do next depending on whether
    any chunk is still pending.

    Returns:
        Always 0.
    """
    parser = argparse.ArgumentParser(description="Extraction orchestrator.")
    parser.add_argument("--corpus", default="data/carti-camp-jocuri")
    parser.add_argument("--sources", default="data/sources")
    parser.add_argument("--chunks", default="data/chunks")
    parser.add_argument("--skip-normalize", action="store_true",
                        help="skip normalization, re-chunk existing sources only")
    args = parser.parse_args(argv)

    summary = run(
        corpus_root=Path(args.corpus),
        sources_dir=Path(args.sources),
        chunks_dir=Path(args.chunks),
        skip_normalize=args.skip_normalize,
    )

    print("=" * 60)
    print("EXTRACTION ORCHESTRATOR")
    print("=" * 60)
    # "normalized" is absent from the summary when --skip-normalize was given.
    if "normalized" in summary:
        n = summary["normalized"]
        print(f"normalized : {n['ok']}/{n['total']} (errors {n['errors']})")
    print(f"chunks : {summary['chunks']['chunks']}")
    for state, count in sorted(summary["states"].items()):
        print(f" {state:<10}: {count}")
    print(f"\npending chunks remaining : {summary['pending']}")
    if summary["pending"]:
        print(f"subagent prompts written : {summary['prompts_dir']}/")
        print("Launch waves of ~5-10 subagents on those prompts, then run "
              "validate_extractions.py and build_database.py --rebuild.")
    else:
        print("All chunks extracted — run build_database.py --rebuild.")
    print("=" * 60)
    return 0


if __name__ == "__main__":
    # Run the orchestrator exactly once and propagate its return code; a
    # leftover bare ``main()`` call here would execute the pipeline twice.
    raise SystemExit(main())

View File

@@ -1,197 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Text/Markdown Activity Extractor
Proceseaza fisiere TXT si MD pentru extractie activitati
"""
import re
from pathlib import Path
from typing import List, Dict
import sqlite3
from datetime import datetime
class TextActivityExtractor:
    """Heuristic activity extractor for plain-text and Markdown files.

    Three independent heuristics are applied to each file (Markdown headers,
    marker keywords, blank-line separated blocks) and their results are
    simply concatenated. ``save_to_database`` skips a row when an activity
    with the same name and source file is already stored.
    """

    def __init__(self, db_path='data/activities.db'):
        self.db_path = db_path
        self.activity_patterns = {
            'section_headers': [
                r'^#{1,6}\s*(.+)$',  # Markdown headers
                r'^([A-Z][^\.]{10,100})$',  # Plain title lines
                r'^\d+\.\s*(.+)$',  # Numbered lists
                r'^[•\-\*]\s*(.+)$',  # Bullet points
            ],
            'activity_markers': [
                'joc:', 'activitate:', 'exercitiu:', 'team building:',
                'nume:', 'titlu:', 'denumire:'
            ]
        }

    def extract_from_text(self, file_path: str) -> List[Dict]:
        """Extract activities from one text/Markdown file.

        Any exception is printed and ends processing of the file; activities
        collected before the failure are still returned.
        """
        activities = []

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Method 1: Markdown sections (only for .md files)
            if file_path.endswith('.md'):
                activities.extend(self._extract_from_markdown(content, file_path))

            # Method 2: generic marker patterns
            activities.extend(self._extract_from_patterns(content, file_path))

            # Method 3: structured text blocks
            activities.extend(self._extract_from_blocks(content, file_path))

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

        return activities

    def _extract_from_markdown(self, content, source_file):
        """Extract activities introduced by level 1-3 Markdown headers.

        A header starts an activity when its text contains 'joc',
        'activitate' or 'exercitiu'; the non-blank lines that follow (at most
        20) become the description. An activity without any content line is
        dropped.
        """
        activities = []
        lines = content.split('\n')

        current_activity = None
        current_content = []

        for line in lines:
            # Is this line a header?
            if re.match(r'^#{1,3}\s*(.+)', line):
                # Flush the previous activity, if it collected any content
                if current_activity and current_content:
                    current_activity['description'] = '\n'.join(current_content[:20])  # max 20 lines
                    activities.append(current_activity)

                # Does the new header announce an activity?
                header_text = re.sub(r'^#{1,3}\s*', '', line)
                if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']):
                    current_activity = {
                        'name': header_text[:200],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    current_content = []
                else:
                    current_activity = None

            elif current_activity:
                # Collect non-blank content for the current activity
                if line.strip():
                    current_content.append(line)

        # Flush the last activity
        if current_activity and current_content:
            current_activity['description'] = '\n'.join(current_content[:20])
            activities.append(current_activity)

        return activities

    def _extract_from_patterns(self, content, source_file):
        """Extract activities that follow one of the marker keywords.

        The text after a marker, up to the next blank line, the next
        occurrence of the same marker or the end of the content, is kept when
        it is longer than 20 characters.
        """
        activities = []

        for marker in self.activity_patterns['activity_markers']:
            escaped = re.escape(marker)
            pattern = re.compile(f'{escaped}\\s*(.+?)(?=\\n\\n|{escaped}|$)',
                                 re.IGNORECASE | re.DOTALL)

            for match in pattern.finditer(content):
                activity_text = match.group(1)
                if len(activity_text) > 20:
                    activities.append({
                        'name': activity_text.split('\n')[0][:200],
                        'description': activity_text[:1000],
                        'source_file': str(source_file),
                        'category': '[A]'
                    })

        return activities

    def _extract_from_blocks(self, content, source_file):
        """Extract activities from blank-line separated blocks.

        A block longer than 50 characters is kept when its first line
        contains 'joc', 'activitate' or 'exercitiu'.
        """
        activities = []

        # Split into blocks separated by blank lines
        blocks = re.split(r'\n\s*\n', content)

        for block in blocks:
            if len(block) > 50:  # at least 50 characters
                lines = block.strip().split('\n')
                first_line = lines[0].strip()

                # Does the block look like an activity?
                if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
                    activities.append({
                        'name': first_line[:200],
                        'description': block[:1000],
                        'source_file': str(source_file),
                        'category': '[A]'
                    })

        return activities

    def save_to_database(self, activities):
        """Insert ``activities`` into the ``activities`` table.

        A row is skipped when one with the same name and source file already
        exists. Errors on individual rows are printed and do not stop the
        batch.

        Returns:
            The number of rows inserted.
        """
        conn = sqlite3.connect(self.db_path)
        saved_count = 0
        try:
            cursor = conn.cursor()

            for activity in activities:
                try:
                    # Check for duplicates
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )
                    if cursor.fetchone():
                        continue

                    # NOTE: column names come from the activity dict keys and
                    # are interpolated into the SQL; values are bound.
                    columns = list(activity.keys())
                    values = list(activity.values())
                    placeholders = ['?' for _ in values]

                    query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
                    cursor.execute(query, values)
                    saved_count += 1

                except Exception as e:
                    print(f"Error saving: {e}")

            conn.commit()
        finally:
            # Always release the connection, even when commit() raises.
            conn.close()

        return saved_count

    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every ``.txt`` and ``.md`` file under ``base_path``.

        Returns:
            A ``(files_processed, activities_saved)`` tuple.
        """
        base_path = Path(base_path)

        text_files = list(base_path.rglob("*.txt"))
        md_files = list(base_path.rglob("*.md"))
        all_files = text_files + md_files

        print(f"Found {len(all_files)} text/markdown files")

        all_activities = []

        for file_path in all_files:
            activities = self.extract_from_text(str(file_path))
            all_activities.extend(activities)
            print(f"Processed {file_path.name}: {len(activities)} activities")

        # Save to database
        saved = self.save_to_database(all_activities)
        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")

        return len(all_files), saved


if __name__ == "__main__":
    extractor = TextActivityExtractor()
    extractor.process_all_text_files()

View File

@@ -1,151 +0,0 @@
#!/usr/bin/env python3
"""
Unified Activity Processor
Orchestrează toate extractoarele pentru procesare completă
"""
import time
from pathlib import Path
from html_extractor import HTMLActivityExtractor
from text_extractor import TextActivityExtractor
import sqlite3
class UnifiedProcessor:
    """Run the HTML and text extractors and report what remains for Claude.

    The automated phase processes HTML and text/Markdown files through the
    two extractors, records counters in ``self.stats``, prints a summary and
    writes the list of PDF/DOC files to ``pdf_doc_for_claude.txt``.
    """

    def __init__(self, db_path='data/activities.db'):
        self.db_path = db_path
        self.html_extractor = HTMLActivityExtractor(db_path)
        self.text_extractor = TextActivityExtractor(db_path)
        self.stats = {
            'html_processed': 0,
            'text_processed': 0,
            'pdf_to_process': 0,
            'doc_to_process': 0,
            'total_activities': 0,
            'start_time': None,
            'end_time': None
        }

    def get_current_activity_count(self):
        """Return the current number of rows in the ``activities`` table."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM activities")
            return cursor.fetchone()[0]
        finally:
            # Release the connection even when the query fails.
            conn.close()

    def count_files_to_process(self, base_path):
        """Count the files under ``base_path`` (recursively) per format.

        Returns a dict with the keys ``html`` (``.html`` plus ``.htm``),
        ``txt``, ``md``, ``pdf``, ``doc`` and ``docx``.
        """
        base_path = Path(base_path)

        counts = {
            'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
            'txt': len(list(base_path.rglob("*.txt"))),
            'md': len(list(base_path.rglob("*.md"))),
            'pdf': len(list(base_path.rglob("*.pdf"))),
            'doc': len(list(base_path.rglob("*.doc"))),
            'docx': len(list(base_path.rglob("*.docx")))
        }

        return counts

    def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every format that can be handled without manual work.

        Runs the HTML extractor, then the text/Markdown extractor, updates
        ``self.stats``, prints the summary and saves the PDF/DOC list.
        """
        print("="*60)
        print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
        print("="*60)

        self.stats['start_time'] = time.time()
        initial_count = self.get_current_activity_count()

        # Show the initial statistics
        file_counts = self.count_files_to_process(base_path)
        print("\nFiles to process:")
        for extension, count in file_counts.items():
            print(f" {extension.upper()}: {count} files")

        print(f"\nCurrent activities in database: {initial_count}")
        print("-"*60)

        # PHASE 1: HTML (highest priority - large volume)
        print("\n[1/2] Processing HTML files...")
        print("-"*40)
        html_processed, _html_errors = self.html_extractor.process_all_html_files(base_path)
        self.stats['html_processed'] = html_processed

        # PHASE 2: Text/MD
        print("\n[2/2] Processing Text/Markdown files...")
        print("-"*40)
        text_processed, _text_saved = self.text_extractor.process_all_text_files(base_path)
        self.stats['text_processed'] = text_processed

        # Final statistics
        self.stats['end_time'] = time.time()
        final_count = self.get_current_activity_count()
        self.stats['total_activities'] = final_count - initial_count

        # Files that still need manual processing
        self.stats['pdf_to_process'] = file_counts['pdf']
        self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']

        self.print_summary()
        self.save_pdf_doc_list(base_path)

    def print_summary(self):
        """Print the processing summary built from ``self.stats``."""
        print("\n" + "="*60)
        print("PROCESSING SUMMARY")
        print("="*60)

        duration = self.stats['end_time'] - self.stats['start_time']

        print("\nAutomated Processing Results:")
        print(f" HTML files processed: {self.stats['html_processed']}")
        print(f" Text/MD files processed: {self.stats['text_processed']}")
        print(f" New activities added: {self.stats['total_activities']}")
        print(f" Processing time: {duration:.1f} seconds")

        print("\nFiles requiring Claude processing:")
        print(f" PDF files: {self.stats['pdf_to_process']}")
        print(f" DOC/DOCX files: {self.stats['doc_to_process']}")

        print("\n" + "="*60)
        print("NEXT STEPS:")
        print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
        print("2. Use Claude to extract activities from PDF/DOC files")
        print("3. Focus on largest PDF files first (highest activity density)")
        print("="*60)

    def save_pdf_doc_list(self, base_path):
        """Write the list of PDF/DOC files to ``pdf_doc_for_claude.txt``.

        PDFs are sorted by size, largest first, and only the first 20 are
        listed individually. The file is written to the current working
        directory.
        """
        base_path = Path(base_path)

        pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
        doc_files = list(base_path.rglob("*.doc"))
        docx_files = list(base_path.rglob("*.docx"))

        with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
            f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
            f.write("="*60 + "\n")
            f.write("Files sorted by size (largest first = likely more activities)\n\n")

            f.write("TOP PRIORITY PDF FILES (process these first):\n")
            f.write("-"*40 + "\n")
            for i, pdf in enumerate(pdf_files[:20], 1):
                size_mb = pdf.stat().st_size / (1024*1024)
                f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
                f.write(f" Path: {pdf}\n\n")

            if len(pdf_files) > 20:
                f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")

            f.write("\nDOC/DOCX FILES:\n")
            f.write("-"*40 + "\n")
            for doc in doc_files + docx_files:
                size_kb = doc.stat().st_size / 1024
                f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")

        print("\nPDF/DOC list saved to: pdf_doc_for_claude.txt")


if __name__ == "__main__":
    processor = UnifiedProcessor()
    processor.process_automated_formats()

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
validate_extractions.py — validate every data/extracted/*.json (plan §5b).
For each extraction file it runs two checks:
1. JSON-schema validation against scripts/activity_schema.json,
2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy
substring of the chunk it came from).
For every failing chunk it:
* writes the exact re-extraction prompt to data/extracted/_reextract/<chunk>.prompt.md,
* marks the chunk `rejected` in data/chunks/manifest.json.
The orchestrator then re-launches subagents only on the `rejected` chunks; the
loop repeats until nothing is rejected.
Usage:
python scripts/validate_extractions.py
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Optional
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
if _p not in sys.path:
sys.path.insert(0, _p)
from import_common import ( # noqa: E402
DEFAULT_SCHEMA_PATH,
chunk_key_for,
excerpt_matches,
excerpt_score,
find_chunk_text,
iter_extraction_files,
load_schema,
validate_extraction,
)
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
# --------------------------------------------------------------------------
# re-extraction prompt
# --------------------------------------------------------------------------
def build_reextraction_prompt(
    chunk_key: str, chunk_file: Optional[str], errors: list[str]
) -> str:
    """Build the prompt text that asks a subagent to redo a rejected chunk.

    The prompt lists every rejection reason as a bullet, then the steps to
    follow. When ``chunk_file`` is empty a placeholder path derived from
    ``chunk_key`` is used instead.
    """
    chunk_ref = chunk_file if chunk_file else f"data/chunks/<source_id>/{chunk_key}.txt"
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT)

    intro = [
        f"# RE-EXTRACTION — chunk `{chunk_key}`",
        "",
        "The previous extraction for this chunk was **REJECTED**. Reasons:",
        "",
    ]
    reasons = [f"- {problem}" for problem in errors]
    instructions = [
        "",
        "## What to do",
        "",
        f"1. Read ONLY this chunk: `{chunk_ref}`",
        f"2. Follow the extraction rules in `{rules_path}`.",
        "3. Fix every problem listed above. In particular:",
        " - every `source_excerpt` must be copied **verbatim** from the chunk",
        " (it is checked as a fuzzy substring — invented quotes are rejected);",
        " - `source_excerpt` and `page_reference` are mandatory on every activity;",
        " - the output must validate against `scripts/activity_schema.json`.",
        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
        "",
    ]
    return "\n".join(intro + reasons + instructions)
# --------------------------------------------------------------------------
# manifest
# --------------------------------------------------------------------------
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, always returning a dict with a ``chunks`` dict.

    The loader is best-effort: ``{"chunks": {}}`` is returned when the file
    is missing, cannot be read, is not valid UTF-8 / JSON, or when its
    top-level value is not a JSON object. A manifest whose ``chunks`` value
    is absent or not an object gets an empty ``chunks`` dict.
    """
    # NOTE(review): a corrupt manifest silently becomes an empty one, and a
    # later save would then overwrite the original file — confirm that this
    # is the intended trade-off.
    if manifest_path.is_file():
        try:
            data = json.loads(manifest_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            data = None
        if isinstance(data, dict):
            if not isinstance(data.get("chunks"), dict):
                data["chunks"] = {}
            return data
    return {"chunks": {}}
def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write ``manifest`` to ``manifest_path`` as indented UTF-8 JSON.

    Missing parent directories are created first.
    """
    manifest_path.parent.mkdir(exist_ok=True, parents=True)
    serialized = json.dumps(manifest, ensure_ascii=False, indent=2)
    manifest_path.write_text(serialized, encoding="utf-8")
def mark_rejected(manifest: dict, chunk_key: str) -> None:
    """Set the state of ``chunk_key`` to ``rejected`` in ``manifest``.

    An existing entry keeps its other fields; a missing entry is created.
    The manifest is modified in place.
    """
    manifest["chunks"].setdefault(chunk_key, {})["state"] = "rejected"
# --------------------------------------------------------------------------
# validation
# --------------------------------------------------------------------------
def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
    """Return the list of errors for one extraction file (empty == valid).

    Checks run in order and stop at the first failing stage:

    1. the file must be readable, valid UTF-8 and valid JSON;
    2. it must pass ``validate_extraction`` against ``schema``;
    3. the source chunk text must be found;
    4. every activity's ``source_excerpt`` must match the chunk text.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except OSError as exc:
        return [f"unreadable file: {exc}"]
    except ValueError as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, so a
        # file with stray bytes is reported instead of aborting the run.
        return [f"invalid JSON: {exc}"]

    errors = validate_extraction(data, schema)
    if errors:
        return errors

    header = data.get("header", {})
    chunk_text = find_chunk_text(json_path, header, chunks_dir)
    if chunk_text is None:
        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]

    for adict in data.get("activities", []):
        excerpt = adict.get("source_excerpt") or ""
        if not excerpt_matches(excerpt, chunk_text):
            # The score is computed only for the error message.
            score = excerpt_score(excerpt, chunk_text)
            errors.append(
                f"activity {adict.get('name')!r}: source_excerpt not found in "
                f"chunk (best match {score:.0f}/100) — possible hallucination"
            )
    return errors
def _read_header(json_path: Path) -> dict:
    """Best-effort read of an extraction file's ``header`` object.

    Returns ``{}`` when the file cannot be read or parsed, or when either
    the top-level value or ``header`` is not a JSON object.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return {}
    header = data.get("header") if isinstance(data, dict) else None
    return header if isinstance(header, dict) else {}


def run(
    extracted_dir: Path,
    chunks_dir: Path,
    manifest_path: Path,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
) -> dict:
    """Validate every extraction file and record the rejected chunks.

    For each failing file a re-extraction prompt is written to
    ``<extracted_dir>/_reextract/<chunk>.prompt.md`` and the chunk is marked
    ``rejected`` in the manifest, which is saved once at the end.

    Returns:
        A report dict with ``total``, ``valid``, ``rejected`` and
        ``rejected_chunks`` (a list of ``{"chunk", "errors"}`` dicts).
    """
    schema = load_schema(schema_path)
    manifest = load_manifest(manifest_path)
    reextract_dir = extracted_dir / "_reextract"

    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}

    for json_path in iter_extraction_files(extracted_dir):
        report["total"] += 1
        errors = validate_file(json_path, schema, chunks_dir)
        if not errors:
            report["valid"] += 1
            continue

        report["rejected"] += 1
        # The rejected file may be malformed in any way, so the header is
        # read defensively.
        header = _read_header(json_path)
        chunk_key = chunk_key_for(json_path, header)

        chunk_file = None
        meta = manifest["chunks"].get(chunk_key)
        if meta:
            chunk_file = meta.get("chunk_file")

        reextract_dir.mkdir(parents=True, exist_ok=True)
        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")

        mark_rejected(manifest, chunk_key)
        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})

    save_manifest(manifest, manifest_path)
    return report
# --------------------------------------------------------------------------
# CLI
# --------------------------------------------------------------------------
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: validate the extractions and print a report.

    Prints the totals, then each rejected chunk with its errors, and finally
    where the re-extraction prompts were written when any chunk was rejected.

    Returns:
        Always 0.
    """
    cli = argparse.ArgumentParser(description="Validate extraction JSON files.")
    cli.add_argument("--extracted", default="data/extracted")
    cli.add_argument("--chunks", default="data/chunks")
    cli.add_argument("--manifest", default="data/chunks/manifest.json")
    cli.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH))
    options = cli.parse_args(argv)

    report = run(
        Path(options.extracted),
        Path(options.chunks),
        Path(options.manifest),
        Path(options.schema),
    )

    print(f"extraction files : {report['total']}")
    print(f" valid : {report['valid']}")
    print(f" rejected : {report['rejected']}")

    for rejected in report["rejected_chunks"]:
        print(f" [rejected] {rejected['chunk']}")
        for problem in rejected["errors"]:
            print(f" - {problem}")

    if report["rejected"]:
        print(f"\nRe-extraction prompts written to {options.extracted}/_reextract/")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

114
tests/conftest.py Normal file
View File

@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
"""
Shared pytest fixtures for the extraction-pipeline tests.
scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
"""
import sys
import zipfile
from pathlib import Path
import pytest
REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT / "scripts"
if str(SCRIPTS_DIR) not in sys.path:
sys.path.insert(0, str(SCRIPTS_DIR))
# --------------------------------------------------------------------------
# synthetic PDF — deliberately large to pin the "no max_pages" regression
# --------------------------------------------------------------------------
@pytest.fixture
def big_pdf(tmp_path):
    """Build a 60-page PDF whose pages each carry a unique 'PDFMARK-<n>' token."""
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas

    pdf_path = tmp_path / "big.pdf"
    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
    for page_number in range(1, 61):
        # Two text lines per page, then close the page.
        pdf.drawString(72, 720, f"PDFMARK-{page_number} synthetic activity page number {page_number}")
        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
        pdf.showPage()
    pdf.save()
    return pdf_path
# --------------------------------------------------------------------------
# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
# --------------------------------------------------------------------------
@pytest.fixture
def sample_docx(tmp_path):
    """Build a .docx file holding 100 numbered paragraphs and return its path."""
    import docx

    docx_path = tmp_path / "sample.docx"
    doc = docx.Document()
    for index in range(100):
        doc.add_paragraph(f"Paragraf {index}: continut joc team-building.")
    doc.save(str(docx_path))
    return docx_path
# --------------------------------------------------------------------------
# synthetic HTML mirror page — with nav/script/footer chrome to strip
# --------------------------------------------------------------------------
# Page markup: real content inside <main>, surrounded by style/script/nav/
# header/footer chrome.
HTML_WITH_NAV = """<!doctype html>
<html><head><title>Joc</title>
<style>.x{color:red}</style>
<script>var tracking = 1;</script>
</head><body>
<nav><a href="/">Home</a><a href="/games">Games</a></nav>
<header>Site Banner Junk</header>
<main>
<h1>Vanatoarea de comori</h1>
<p>Acesta este un joc real de orientare pentru cercetasi.</p>
<p>Jucatorii cauta indicii ascunse in tabara.</p>
</main>
<footer>Copyright 2024 - toate drepturile rezervate</footer>
</body></html>
"""


@pytest.fixture
def html_with_nav(tmp_path):
    """Write HTML_WITH_NAV to ``page.html`` in the temp dir and return its path."""
    page_path = tmp_path / "page.html"
    page_path.write_text(HTML_WITH_NAV, encoding="utf-8")
    return page_path
# --------------------------------------------------------------------------
# synthetic zip — contains a docx and a stray junk file
# --------------------------------------------------------------------------
@pytest.fixture
def sample_zip(tmp_path, sample_docx):
    """Build a zip holding the sample docx (as inner/sample.docx) plus a junk file."""
    archive_path = tmp_path / "archive.zip"
    with zipfile.ZipFile(archive_path, "w") as archive:
        archive.write(sample_docx, arcname="inner/sample.docx")
        archive.writestr("desktop.ini", "junk")
    return archive_path
# --------------------------------------------------------------------------
# synthetic normalized source — paginated, with an activity straddling a
# page boundary so the chunker overlap can be verified.
# --------------------------------------------------------------------------
@pytest.fixture
def paginated_source(tmp_path):
    """Build a 50-page normalized source with an activity spanning pages 20-21."""
    # Pages 20 and 21 carry the two halves of the straddling activity.
    special_pages = {
        20: "ACTIVITY-START jocul podului care traverseaza pagina",
        21: "continuare a jocului podului ACTIVITY-END",
    }

    content = [
        "SOURCE: synthetic/test.pdf",
        "CONVERTED: 2026-05-19",
        "FORMAT: pdf",
        "=" * 50,
        "",
    ]
    for page_number in range(1, 51):
        content.append(f"--- PAGE {page_number} ---")
        content.append(
            special_pages.get(page_number, f"continut obisnuit pe pagina {page_number}")
        )
        content.append("")

    source_path = tmp_path / "src_paginated.txt"
    source_path.write_text("\n".join(content), encoding="utf-8")
    return source_path

3
tests/fixtures/.gitkeep vendored Normal file
View File

@@ -0,0 +1,3 @@
# Test fixtures (synthetic PDF/docx/zip/HTML) are generated at runtime by
# tests/conftest.py — no binary blobs are committed. This file only preserves
# the directory in git.

View File

@@ -0,0 +1,334 @@
# -*- coding: utf-8 -*-
"""
Tests for scripts/build_database.py — the import / dedup / swap side.
Covers: category -> slug + `altele` fallback; dedup across all three threshold
bands; EN != RO never merged; field combination on merge; atomic swap with a
simulated mid-build crash; the source_excerpt substring check.
"""
import json
import os
import sys
from pathlib import Path
import pytest
# Put the repository root and its scripts/ directory at the front of sys.path
# (when not already present) so the imports that follow can be resolved.
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = REPO_ROOT / "scripts"
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
import build_database as bd # noqa: E402
from app.models.activity import Activity # noqa: E402
from app.models.database import DatabaseManager # noqa: E402
# --------------------------------------------------------------------------
# helpers
# --------------------------------------------------------------------------
def _activity(**over):
    """Return an ``Activity`` built from baseline test values overridden by ``over``."""
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
    }
    return Activity(**{**defaults, **over})
def _ext_activity(**over):
    """Return an extraction-JSON activity dict; keyword arguments override the defaults."""
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ANCHOR-EXCERPT despre jocul testului",
        "page_reference": "page 1",
    }
    return {**defaults, **over}
def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
    """Write ``<chunk_key>.json`` (fixed header plus ``activities``) into ``extracted_dir``.

    The directory is created when missing. The header carries constant hash,
    version and chunk-range values together with ``source_id`` and ``chunk_key``.
    """
    header = {
        "source_hash": "hash1234deadbeef",
        "schema_version": "1.0",
        "prompt_version": "1.0",
        "chunk_range": "pages 1-20",
        "source_id": source_id,
        "chunk_key": chunk_key,
    }
    document = {"header": header, "activities": activities}
    extracted_dir.mkdir(parents=True, exist_ok=True)
    target = extracted_dir / f"{chunk_key}.json"
    target.write_text(json.dumps(document, ensure_ascii=False), encoding="utf-8")
def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write ``text`` to ``<chunks_dir>/<source_id>/<chunk_key>.txt``, creating directories."""
    source_dir = chunks_dir / source_id
    source_dir.mkdir(parents=True, exist_ok=True)
    chunk_file = source_dir / f"{chunk_key}.txt"
    chunk_file.write_text(text, encoding="utf-8")
# --------------------------------------------------------------------------
# step 3 — category normalization
# --------------------------------------------------------------------------
def test_category_alias_mapped_to_slug():
    """The ``teambuilding`` alias is converted to the ``team-building`` slug."""
    raw = _ext_activity(category="teambuilding")
    activity = bd.dict_to_activity(raw, "s.txt")
    assert activity.category == "team-building"
def test_unknown_category_falls_back_to_altele():
    """A category that matches nothing is stored as ``altele``."""
    raw = _ext_activity(category="zzz-not-a-category")
    activity = bd.dict_to_activity(raw, "s.txt")
    assert activity.category == "altele"
def test_content_type_normalized():
    """The ``games`` content type is normalized to ``joc``."""
    raw = _ext_activity(content_type="games")
    activity = bd.dict_to_activity(raw, "s.txt")
    assert activity.content_type == "joc"
# --------------------------------------------------------------------------
# step 4 — dedup, three bands
# --------------------------------------------------------------------------
def test_dedup_auto_merge_identical_descriptions():
    """Two activities with identical descriptions collapse into one unflagged row."""
    shared = "copiii formeaza echipe si traverseaza terenul"
    pair = [_activity(description=shared), _activity(description=shared)]
    out, stats = bd.dedup_activities(pair)
    assert len(out) == 1
    assert stats["auto_merged"] == 1
    survivor = out[0]
    assert survivor.needs_review == 0
def test_dedup_borderline_keeps_both_and_flags_needs_review():
    """Descriptions scoring in [60, 85) are both kept and both flagged for review."""
    from rapidfuzz import fuzz

    d1 = "alpha beta gamma delta epsilon"
    d2 = "alpha beta gamma delta epsilon zeta eta theta iota"
    # Guard the fixture itself: the pair must really land in the borderline band.
    score = fuzz.token_sort_ratio(d1, d2)
    assert 60.0 <= score < 85.0, f"precondition: score={score} not borderline"

    candidates = [_activity(description=d1), _activity(description=d2)]
    out, stats = bd.dedup_activities(candidates)
    assert len(out) == 2
    assert stats["borderline"] == 2
    for act in out:
        assert act.needs_review == 1
def test_dedup_low_similarity_kept_as_separate_variants():
    """Descriptions scoring below 60 stay separate and are not flagged for review."""
    from rapidfuzz import fuzz

    d1 = "alpha beta gamma delta epsilon"
    d2 = "quebec romeo sierra tango uniform victor whiskey"
    # Guard the fixture: the two descriptions must be clearly dissimilar.
    assert fuzz.token_sort_ratio(d1, d2) < 60.0

    candidates = [_activity(description=d1), _activity(description=d2)]
    out, stats = bd.dedup_activities(candidates)
    assert len(out) == 2
    assert stats["auto_merged"] == 0
    for act in out:
        assert act.needs_review == 0
def test_dedup_never_merges_across_languages():
    """Identical name and description in RO and EN remain two distinct rows."""
    desc = "children form teams and cross the field"
    candidates = [
        _activity(name="Cursa", description=desc, language=code)
        for code in ("ro", "en")
    ]
    out, stats = bd.dedup_activities(candidates)
    assert len(out) == 2
    assert stats["auto_merged"] == 0
    assert {act.language for act in out} == {"ro", "en"}
def test_merge_combines_fields():
    """A merge keeps the longer rules and unions materials, sources and keywords."""
    desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
    long_rules = "o regula mult mai lunga si mai detaliata pentru joc"
    first = _activity(
        description=desc,
        rules="regula scurta",
        materials_list="franghie, esarfa",
        source_file="a.txt",
        keywords="echipa",
    )
    second = _activity(
        description=desc,
        rules=long_rules,
        materials_list="busola, esarfa",
        source_file="b.txt",
        keywords="cooperare",
    )

    out, _ = bd.dedup_activities([first, second])
    assert len(out) == 1
    merged = out[0]

    assert merged.rules == long_rules
    materials = {item.strip() for item in merged.materials_list.split(",")}
    assert materials == {"franghie", "esarfa", "busola"}
    assert set(merged.source_files) == {"a.txt", "b.txt"}
    assert merged.popularity_score == 1
    keywords = {word.strip() for word in merged.keywords.split(",")}
    assert keywords == {"echipa", "cooperare"}
# --------------------------------------------------------------------------
# step 5 — review decisions
# --------------------------------------------------------------------------
def test_review_decision_drop_removes_row():
    """A ``drop`` decision keyed by the activity's content key removes the row."""
    from import_common import content_key, normalize_name

    activity = _activity(description="o descriere de test")
    key = content_key(
        normalize_name(activity.name), activity.language, activity.description
    )
    decisions = {key: {"decision": "drop"}}
    kept, stats = bd.apply_review_decisions([activity], decisions)
    assert kept == []
    assert stats["dropped"] == 1
def test_review_decision_keep_separate_clears_needs_review():
    """A ``keep-separate`` decision keeps the row and resets ``needs_review`` to 0."""
    from import_common import content_key, normalize_name

    activity = _activity(description="o descriere de test")
    activity.needs_review = 1
    key = content_key(
        normalize_name(activity.name), activity.language, activity.description
    )
    decisions = {key: {"decision": "keep-separate"}}
    kept, stats = bd.apply_review_decisions([activity], decisions)
    assert len(kept) == 1 and kept[0].needs_review == 0
    assert stats["resolved"] == 1
# --------------------------------------------------------------------------
# step 2b — source_excerpt hallucination check
# --------------------------------------------------------------------------
def test_hallucinated_excerpt_activity_dropped(tmp_path):
    """An activity whose excerpt is absent from the chunk text is dropped and counted."""
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    sources = tmp_path / "sources"

    # One activity quotes text present in the chunk, the other quotes text that is not.
    grounded = _ext_activity(
        name="Joc real", source_excerpt="textul real apare in bucata sursa"
    )
    invented = _ext_activity(
        name="Joc inventat",
        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
    )
    _write_extraction(extracted, "src01.part01", [grounded, invented])
    chunk_text = (
        "--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n"
    )
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)

    from import_common import load_schema

    schema = load_schema()
    res = bd.collect_activities(extracted, chunks, sources, schema)
    surviving_names = {act.name for act in res["activities"]}
    assert surviving_names == {"Joc real"}
    assert res["activities_hallucinated"] == 1
    assert (extracted / "_rejected").exists()
def test_schema_invalid_file_moved_to_rejected(tmp_path):
    """A schema-invalid extraction file is moved to ``_rejected`` with an errors file."""
    extracted = tmp_path / "extracted"
    chunks = tmp_path / "chunks"
    sources = tmp_path / "sources"
    extracted.mkdir(parents=True)

    # Empty header (required keys missing) plus an activity that only has a name.
    invalid_payload = {"header": {}, "activities": [{"name": "x"}]}
    bad_file = extracted / "bad.json"
    bad_file.write_text(json.dumps(invalid_payload), encoding="utf-8")

    from import_common import load_schema

    res = bd.collect_activities(extracted, chunks, sources, load_schema())
    assert res["files_rejected_schema"] == 1
    assert not bad_file.exists()
    rejected_dir = extracted / "_rejected"
    assert (rejected_dir / "bad.json").exists()
    assert (rejected_dir / "bad.errors.txt").exists()
# --------------------------------------------------------------------------
# end-to-end rebuild + atomic swap
# --------------------------------------------------------------------------
def _setup_corpus(tmp_path):
    """Create a one-activity corpus and return ``(extracted, chunks, sources)`` paths.

    The activity's excerpt also appears in the matching chunk text. The
    ``sources`` path is returned but nothing is written to it here.
    """
    extracted, chunks, sources = (
        tmp_path / name for name in ("extracted", "chunks", "sources")
    )
    excerpt = "jocul testului este o activitate de echipa"
    activities = [_ext_activity(source_excerpt=excerpt)]
    _write_extraction(extracted, "src01.part01", activities)
    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
    _write_chunk(chunks, "src01", "src01.part01", chunk_text)
    return extracted, chunks, sources
def test_rebuild_creates_database(tmp_path):
    """A rebuild writes the database file containing the single corpus activity."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )
    assert db_path.exists()
    assert report["final_count"] == 1

    # Read the result back through the regular database API.
    rows = DatabaseManager(str(db_path)).search_activities()
    assert len(rows) == 1
    only_row = rows[0]
    assert only_row["category"] == "team-building"
def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
    """A crash during the bulk insert leaves the live DB bytes unchanged and no tmp file."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"

    # Seed a live database with one sentinel row and snapshot its bytes.
    live = DatabaseManager(str(db_path))
    live.insert_activity(_activity(name="Sentinel viu"))
    snapshot = db_path.read_bytes()

    def boom(self, *a, **k):
        raise RuntimeError("simulated mid-build crash")

    # Make the bulk insert fail so the rebuild aborts part-way through.
    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", boom)
    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
        bd.rebuild(
            extracted_dir=extracted,
            chunks_dir=chunks,
            sources_dir=sources,
            db_path=db_path,
        )

    assert db_path.read_bytes() == snapshot
    leftover_tmp = tmp_path / "activities.db.tmp"
    assert not leftover_tmp.exists()
def test_rebuild_backs_up_live_db(tmp_path):
    """Rebuilding over an existing DB reports an existing ``activities.db.bak`` backup."""
    extracted, chunks, sources = _setup_corpus(tmp_path)
    db_path = tmp_path / "activities.db"
    # Pre-existing live database that the rebuild has to back up.
    DatabaseManager(str(db_path)).insert_activity(_activity(name="Vechi"))

    report = bd.rebuild(
        extracted_dir=extracted,
        chunks_dir=chunks,
        sources_dir=sources,
        db_path=db_path,
    )
    backup = report["backup"]
    assert backup is not None
    assert Path(backup).exists()
    assert os.path.basename(backup) == "activities.db.bak"

183
tests/test_chunk_sources.py Normal file
View File

@@ -0,0 +1,183 @@
# -*- coding: utf-8 -*-
"""Tests for scripts/chunk_sources.py."""
import json
import chunk_sources as cs
import normalize_sources as ns
def _pages(n):
    """Return ``[(1, "text-1"), ..., (n, "text-n")]`` as synthetic (number, text) pages."""
    pages = []
    for i in range(1, n + 1):
        pages.append((i, f"text-{i}"))
    return pages
# --------------------------------------------------------------------------
# header parsing
# --------------------------------------------------------------------------
def test_parse_source_splits_header_and_body(paginated_source):
    """``parse_source`` returns the header fields and a body starting at page 1."""
    raw = paginated_source.read_text(encoding="utf-8")
    header, body = cs.parse_source(raw)
    assert header["FORMAT"] == "pdf"
    body_start = body.lstrip()
    assert body_start.startswith("--- PAGE 1 ---")
# --------------------------------------------------------------------------
# page chunking
# --------------------------------------------------------------------------
def test_chunk_pages_basic_split():
    """50 pages at 20 per chunk with overlap 4: chunks start at 1 and 17, the last ends at 50."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    head, second, tail = chunks[0], chunks[1], chunks[-1]
    # 20 pages per chunk minus 4 overlapping pages gives a stride of 16.
    assert head["page_start"] == 1 and head["page_end"] == 20
    assert second["page_start"] == 17
    assert tail["page_end"] == 50
def test_chunk_pages_have_overlap():
    """The first two chunks share exactly four pages."""
    chunks = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    first_end = chunks[0]["page_end"]
    second_start = chunks[1]["page_start"]
    shared_pages = first_end - second_start + 1
    assert shared_pages == 4
def test_chunk_pages_short_document_single_chunk():
    """A document shorter than one chunk yields a single chunk covering pages 1-8."""
    chunks = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4)
    assert len(chunks) == 1
    only = chunks[0]
    assert only["page_start"] == 1 and only["page_end"] == 8
def test_chunk_pages_empty():
    """An empty page list produces an empty chunk list."""
    chunks = cs.chunk_pages([])
    assert chunks == []
def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
    """At least one chunk contains both the ACTIVITY-START and ACTIVITY-END markers."""
    chunks = cs.make_chunks(paginated_source.read_text(encoding="utf-8"))
    markers = ("ACTIVITY-START", "ACTIVITY-END")
    full = []
    for chunk in chunks:
        if all(marker in chunk["text"] for marker in markers):
            full.append(chunk)
    assert full, "activity spanning a page boundary was split across all chunks"
# --------------------------------------------------------------------------
# word-window chunking for unpaginated text
# --------------------------------------------------------------------------
def test_chunk_words_window_and_overlap():
    """25,000 words with window 10,000 / overlap 2,000 give 3 chunks sharing 2,000 words."""
    words = [f"w{i}" for i in range(25_000)]
    chunks = cs.chunk_words(" ".join(words), window=10_000, overlap=2_000)
    assert len(chunks) == 3  # stride 8000 over 25000 words
    first_words = chunks[0]["text"].split()
    second_words = chunks[1]["text"].split()
    # The tail of the first window must equal the head of the second one.
    assert first_words[8_000:10_000] == second_words[0:2_000]
def test_make_chunks_unpaginated_uses_word_windows():
    """A body without page markers is split into chunks whose range starts with ``words``."""
    header = "SOURCE: x\nFORMAT: txt\n"
    separator = "=" * 50
    body = "cuvant " * 15_000
    text = "".join([header, separator, "\n\n", body])
    chunks = cs.make_chunks(text)
    assert len(chunks) >= 2
    first_range = chunks[0]["chunk_range"]
    assert first_range.startswith("words")
# --------------------------------------------------------------------------
# stable source ids — anti-collision
# --------------------------------------------------------------------------
def test_stable_id_same_stem_different_path_no_collision():
    """Same file stem under different directories yields distinct ids ending in ``_scout``."""
    first_id = ns.stable_id("camp/games/scout.pdf")
    second_id = ns.stable_id("school/lessons/scout.pdf")
    assert first_id != second_id
    assert all(source_id.endswith("_scout") for source_id in (first_id, second_id))
def test_stable_id_deterministic():
    """Calling ``stable_id`` twice with the same path returns the same id."""
    path = "a/b/c.pdf"
    first_call = ns.stable_id(path)
    second_call = ns.stable_id(path)
    assert first_call == second_call
# --------------------------------------------------------------------------
# manifest registry + idempotency
# --------------------------------------------------------------------------
def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
    """``run`` writes chunk files plus a manifest whose entries all start as pending.

    Each manifest entry must name ``<key>.json`` as its expected output and
    point at a chunk file that exists relative to the chunks directory's parent.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"
    summary = cs.run(sources_dir, chunks_dir)
    assert summary["sources"] == 1
    assert summary["chunks"] >= 2
    # Decode explicitly as UTF-8 rather than relying on the platform locale.
    manifest = json.loads(
        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["chunks"]
    for key, meta in manifest["chunks"].items():
        assert meta["state"] == "pending"
        assert meta["expected_json"] == f"{key}.json"
        assert (chunks_dir.parent / meta["chunk_file"]).exists()
def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
    """Re-running the chunker keeps a chunk's ``done`` state and the chunk count.

    The manifest is edited between the two runs to mark one chunk as done;
    after the second run that state must survive and every entry must be
    either ``pending`` or ``done``.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"
    cs.run(sources_dir, chunks_dir)
    # Simulate the orchestrator marking one chunk as done. The manifest is
    # read with the same explicit UTF-8 encoding it is written with below,
    # instead of the platform locale default.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    n_before = len(manifest["chunks"])
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
    # Re-run: 'done' must survive, and no chunk may be added or lost.
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert len(manifest2["chunks"]) == n_before
    assert manifest2["chunks"][first_key]["state"] == "done"
    assert all(
        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
    )
def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
    """Changing a source's content resets a previously ``done`` chunk to ``pending``."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"
    cs.run(sources_dir, chunks_dir)
    # Read and write the manifest with one explicit encoding (UTF-8) so the
    # test does not depend on the platform locale.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")
    # Append an extra page so the source content differs on the second run.
    src.write_text(
        src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
        encoding="utf-8",
    )
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest2["chunks"][first_key]["state"] == "pending"
def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
    """After its source is deleted, a re-run prunes the chunks and empties the manifest."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"
    cs.run(sources_dir, chunks_dir)
    # Delete the only source so that its chunks become stale.
    src.unlink()
    summary = cs.run(sources_dir, chunks_dir)
    assert summary["chunks"] == 0
    assert summary["pruned"] >= 1
    # Decode explicitly as UTF-8 rather than relying on the platform locale.
    manifest = json.loads(
        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["chunks"] == {}

View File

@@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
"""Tests for scripts/extract_common.py."""
import shutil
import zipfile
import pytest
import extract_common as ec
# --------------------------------------------------------------------------
# format detection
# --------------------------------------------------------------------------
def test_detect_format():
    """``detect_format`` maps file extensions (any case) to format names, else ``unknown``."""
    expected_formats = {
        "a/b/file.PDF": "pdf",
        "x.docx": "docx",
        "x.doc": "doc",
        "x.pptx": "pptx",
        "x.html": "html",
        "x.zip": "zip",
        "x.epub": "epub",
        "x.xyz": "unknown",
    }
    for file_name, fmt in expected_formats.items():
        assert ec.detect_format(file_name) == fmt
def test_is_junk():
    """``desktop.ini``, ``.bak`` and ``README.md`` files are junk; a real PDF is not."""
    for junk_name in ("some/desktop.ini", "notes.bak", "README.md"):
        assert ec.is_junk(junk_name)
    assert not ec.is_junk("1000 Scout Games.pdf")
# --------------------------------------------------------------------------
# PDF — the critical "no max_pages" regression
# --------------------------------------------------------------------------
def test_pdf_extracts_all_60_pages(big_pdf):
    """Extraction of the big PDF reaches page 60 and yields exactly 60 page markers."""
    body = ec.extract_pdf(big_pdf)
    # Regression guard: the previous converter stopped after 50 pages.
    for expected in ("--- PAGE 60 ---", "PDFMARK-60"):
        assert expected in body
    assert ec.count_page_markers(body) == 60
def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The last page produced by ``split_pages`` is numbered 60."""
    pages = ec.split_pages(ec.extract_pdf(big_pdf))
    final_page = pages[-1]
    assert final_page[0] == 60  # last marker is the real last page
# --------------------------------------------------------------------------
# page join / split round-trip
# --------------------------------------------------------------------------
def test_join_split_round_trip():
    """Joining three texts and splitting again returns pages 1-3 with the same texts."""
    originals = ["alpha", "beta", "gamma"]
    pages = ec.split_pages(ec.join_pages(originals))
    numbers = [number for number, _ in pages]
    texts = [text for _, text in pages]
    assert numbers == [1, 2, 3]
    assert texts == ["alpha", "beta", "gamma"]
def test_split_pages_no_markers_returns_empty():
    """Text without any page marker splits into an empty list."""
    pages = ec.split_pages("plain text with no markers")
    assert pages == []
# --------------------------------------------------------------------------
# docx — synthetic page markers
# --------------------------------------------------------------------------
def test_docx_synthetic_page_markers(sample_docx):
    """The sample docx is extracted with three page markers and its last paragraph."""
    body = ec.extract_docx(sample_docx)
    marker_count = ec.count_page_markers(body)
    # 100 paragraphs / 40 per page => 3 pages
    assert marker_count == 3
    assert "Paragraf 99" in body
# --------------------------------------------------------------------------
# HTML mirror — nav/script/footer stripped
# --------------------------------------------------------------------------
def test_html_strips_chrome(html_with_nav):
    """Main content survives HTML extraction; script, header, footer and nav text do not."""
    body = ec.extract_html(html_with_nav)
    for kept in ("Vanatoarea de comori", "joc real de orientare"):
        assert kept in body
    # Text that only occurs in the script, header, footer and nav elements.
    chrome_fragments = (
        "tracking",
        "Site Banner Junk",
        "toate drepturile rezervate",
        "Games",
    )
    for fragment in chrome_fragments:
        assert fragment not in body
# --------------------------------------------------------------------------
# content hash + near-duplicate elimination
# --------------------------------------------------------------------------
def test_content_hash_ignores_whitespace():
    """A trailing newline does not change the hash; different text does."""
    reference = ec.content_hash("hello world")
    assert reference == ec.content_hash("hello world\n")
    assert reference != ec.content_hash("goodbye world")
def test_dedupe_exact_duplicates():
    """Of two identical texts only the first key survives, alongside the distinct one."""
    items = [("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")]
    kept = ec.dedupe_texts(items)
    surviving_keys = [key for key, _ in kept]
    assert surviving_keys == ["a", "c"]
def test_dedupe_near_duplicates():
    """A text differing only by a short suffix is removed; unrelated text is kept."""
    base = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    # Same text plus a short suffix, mimicking a print-view copy of a page.
    near = base + " Pagina printata."
    items = [
        ("orig", base),
        ("print", near),
        ("other", "Cu totul alt continut diferit aici."),
    ]
    kept = ec.dedupe_texts(items, threshold=85.0)
    surviving = {key for key, _ in kept}
    assert "orig" in surviving
    assert "print" not in surviving
    assert "other" in surviving
# --------------------------------------------------------------------------
# zip recursion
# --------------------------------------------------------------------------
def test_zip_recurses_into_inner_files(sample_zip):
    """Zip extraction returns the inner docx's text together with page markers."""
    body = ec.extract_zip(sample_zip)
    assert "Paragraf 0" in body
    marker_count = ec.count_page_markers(body)
    assert marker_count > 0
def test_zip_bad_archive_returns_empty(tmp_path):
    """A file that is not a valid zip archive extracts to an empty string."""
    broken_archive = tmp_path / "broken.zip"
    broken_archive.write_text("not a zip", encoding="utf-8")
    extracted = ec.extract_zip(broken_archive)
    assert extracted == ""
def test_nested_zip(tmp_path, sample_zip):
    """Text of a docx inside a zip that is itself inside a zip is still extracted."""
    outer_path = tmp_path / "outer.zip"
    with zipfile.ZipFile(outer_path, "w") as outer_archive:
        outer_archive.write(sample_zip, arcname="nested/archive.zip")
    extracted = ec.extract_zip(outer_path)
    assert "Paragraf 0" in extracted
# --------------------------------------------------------------------------
# preflight
# --------------------------------------------------------------------------
def test_preflight_python_packages_present():
    """``preflight`` reports no missing Python packages in the test environment."""
    missing = ec.preflight()["missing_python"]
    # Every required package is expected to be installed where the tests run.
    assert missing == []
def test_preflight_reports_libreoffice_state():
    """A libreoffice warning appears exactly when neither binary is on the PATH."""
    report = ec.preflight()
    installed = any(
        shutil.which(binary) for binary in ("libreoffice", "soffice")
    )
    warned = any("libreoffice" in warning for warning in report["warnings"])
    # Installed -> no warning mentions it; missing -> some warning must.
    assert warned != installed
def test_preflight_ocr_flag():
    """With ``check_ocr=True`` a missing tesseract is listed under ``missing_system``."""
    report = ec.preflight(check_ocr=True)
    if shutil.which("tesseract"):
        # Nothing is asserted when tesseract is available.
        return
    assert any("tesseract" in entry for entry in report["missing_system"])
# --------------------------------------------------------------------------
# legacy .doc — skipped unless libreoffice is installed
# --------------------------------------------------------------------------
@pytest.mark.skipif(
    not shutil.which("libreoffice") and not shutil.which("soffice"),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """Smoke test: a docx copied to a ``.doc`` name extracts with at least one page marker."""
    legacy_path = tmp_path / "legacy.doc"
    # The fixture is really a docx; only the file name says .doc.
    shutil.copy(sample_docx, legacy_path)
    extracted = ec.extract_doc(legacy_path)
    assert ec.count_page_markers(extracted) >= 1
def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """``extract_doc`` raises ``RuntimeError`` when no binary can be located."""
    target = tmp_path / "whatever.doc"
    # Make every executable lookup report "not found".
    monkeypatch.setattr(ec.shutil, "which", lambda _: None)
    with pytest.raises(RuntimeError):
        ec.extract_doc(target)

139
tests/test_fts.py Normal file
View File

@@ -0,0 +1,139 @@
"""
Integration tests for the FTS5 search index.
Confirms that materials_list and skills_developed are indexed by FTS5 and kept
in sync by the insert / update / delete triggers (plan §6, §7).
"""
import os
import sys
import json
import pytest
# Put the directory above tests/ at the front of sys.path (unless it is
# already listed) so that the `app` package imports below resolve.
_TESTS_DIR = os.path.dirname(__file__)
PROJECT_ROOT = os.path.abspath(os.path.join(_TESTS_DIR, ".."))
if not any(entry == PROJECT_ROOT for entry in sys.path):
    sys.path.insert(0, PROJECT_ROOT)
from app.models.activity import Activity # noqa: E402
from app.models.database import DatabaseManager # noqa: E402
@pytest.fixture
def db(tmp_path):
    """Return a ``DatabaseManager`` for ``test_activities.db`` inside ``tmp_path``."""
    database_file = tmp_path / "test_activities.db"
    return DatabaseManager(str(database_file))
def _make_activity(**overrides):
    """Return an ``Activity`` built from baseline values overridden by ``overrides``."""
    defaults = {
        "name": "Vânătoarea de comori",
        "description": "O activitate de echipă în aer liber.",
        "category": "camp-outdoor",
        "content_type": "joc",
        "source_file": "test.txt",
        "language": "ro",
    }
    return Activity(**{**defaults, **overrides})
def test_search_by_materials_list(db):
    """Searching for a word present only in ``materials_list`` finds the activity."""
    db.insert_activity(_make_activity(materials_list="frânghie, eșarfă, busolă"))
    hits = db.search_activities(search_text="busolă")
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit["name"] == "Vânătoarea de comori"
def test_search_by_skills_developed(db):
    """Searching for a word present only in ``skills_developed`` finds the activity."""
    db.insert_activity(
        _make_activity(skills_developed="comunicare, leadership, rabdare")
    )
    hits = db.search_activities(search_text="leadership")
    assert len(hits) == 1
    only_hit = hits[0]
    assert only_hit["name"] == "Vânătoarea de comori"
def test_term_absent_from_indexed_columns_no_hit(db):
    """Control case: a term that occurs nowhere in the activity returns no results."""
    activity = _make_activity(materials_list="frânghie")
    db.insert_activity(activity)
    hits = db.search_activities(search_text="zzzunlikelyterm")
    assert hits == []
def test_delete_trigger_removes_from_fts(db):
    """After the row is deleted with raw SQL, a search for its term finds nothing."""
    term = "catalige"
    activity_id = db.insert_activity(_make_activity(materials_list=term))
    hits_before = db.search_activities(search_text=term)
    assert len(hits_before) == 1

    # Delete directly through SQL so only the database itself can update the index.
    with db._get_connection() as conn:
        conn.execute("DELETE FROM activities WHERE id = ?", (activity_id,))
        conn.commit()

    hits_after = db.search_activities(search_text=term)
    assert hits_after == []
def test_update_trigger_resyncs_fts(db):
    """After a raw SQL update of ``materials_list`` only the new term is searchable."""
    old_term, new_term = "creioane", "acuarele"
    activity_id = db.insert_activity(_make_activity(materials_list=old_term))
    assert len(db.search_activities(search_text=old_term)) == 1

    # Update directly through SQL so only the database itself can refresh the index.
    update_sql = "UPDATE activities SET materials_list = ? WHERE id = ?"
    with db._get_connection() as conn:
        conn.execute(update_sql, (new_term, activity_id))
        conn.commit()

    assert db.search_activities(search_text=old_term) == []
    assert len(db.search_activities(search_text=new_term)) == 1
def test_rebuild_fts_index(db):
    """A ``skills_developed`` term is still found after ``rebuild_fts_index``."""
    activity = _make_activity(skills_developed="orientare")
    db.insert_activity(activity)
    db.rebuild_fts_index()
    hits = db.search_activities(search_text="orientare")
    assert len(hits) == 1
def test_new_schema_columns_round_trip(db):
    """The newer columns are stored, read back, and restored by ``Activity.from_dict``."""
    stored = _make_activity(
        source_files=["a.txt", "b.txt"],
        source_excerpt="Citat scurt din sursă.",
        extraction_confidence="high",
        needs_review=1,
        normalized_name="vanatoarea de comori",
    )
    row = db.get_activity_by_id(db.insert_activity(stored))

    expected_scalars = {
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "needs_review": 1,
        "normalized_name": "vanatoarea de comori",
        "source_excerpt": "Citat scurt din sursă.",
    }
    for column, value in expected_scalars.items():
        assert row[column] == value
    # The row holds source_files as a JSON-encoded string.
    assert json.loads(row["source_files"]) == ["a.txt", "b.txt"]

    loaded = Activity.from_dict(row)
    assert loaded.source_files == ["a.txt", "b.txt"]
    assert loaded.content_type == "joc"
def test_normalized_name_auto_derived(db):
    """Without an explicit value, ``normalized_name`` is the lowercase, diacritic-free name."""
    fields = {
        "name": "Ștafetă cu Obstacole",
        "description": "desc",
        "category": "sports-active",
        "source_file": "t.txt",
    }
    activity = Activity(**fields)
    assert activity.normalized_name == "stafeta cu obstacole"

140
tests/test_search.py Normal file
View File

@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
"""
CRITICAL REGRESSION TEST (plan §6, §7).
`search.py` changed the result sets of /search and /api/search: the default
search now EXCLUDES the non-game content types (rețetă / cântec / ceremonie),
which surface only when the user explicitly filters that content_type or picks
a non-game category. This test guards that behaviour.
"""
import pytest
from app.models.activity import Activity
from app.models.database import DatabaseManager
from app.services.search import SearchService
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
# --------------------------------------------------------------------------
# fixtures
# --------------------------------------------------------------------------
def _activity(name, content_type, category="altele", language="ro"):
    """Return an ``Activity`` whose description mentions its name and content type."""
    description = f"Descriere pentru {name}, un conținut de tip {content_type}."
    fields = {
        "name": name,
        "description": description,
        "category": category,
        "content_type": content_type,
        "language": language,
        "source_file": "test/fixture.txt",
    }
    return Activity(**fields)
@pytest.fixture
def search_service(tmp_path):
    """Return a ``SearchService`` over a temp DB seeded with six activities.

    The seed holds one Romanian row per content type plus one English game.
    """
    db = DatabaseManager(str(tmp_path / "activities.db"))
    db.clear_database()
    # (name, content_type, extra keyword arguments for _activity)
    seed_rows = [
        ("Vanatoarea de comori", "joc", {"category": "wide-games"}),
        ("Cercul de cunoastere", "activitate", {"category": "icebreakers"}),
        ("Reteta de paine la ceaun", "reteta", {"category": "retete"}),
        ("Cantecul de tabara", "cantec", {"category": "cantece-ceremonii"}),
        ("Ceremonia de inchidere", "ceremonie", {"category": "cantece-ceremonii"}),
        ("Game in English", "joc", {"category": "wide-games", "language": "en"}),
    ]
    db.bulk_insert_activities(
        [_activity(name, kind, **extra) for name, kind, extra in seed_rows]
    )
    return SearchService(db)
def _content_types(results):
    """Return the set of ``content_type`` values (``None`` when absent) across ``results``."""
    seen = set()
    for row in results:
        seen.add(row.get("content_type"))
    return seen
# --------------------------------------------------------------------------
# the regression: default search excludes non-game content types
# --------------------------------------------------------------------------
def test_default_search_excludes_non_game_content(search_service):
    """An unfiltered search returns games and activities but no non-game content type."""
    types = _content_types(search_service.search_activities())
    assert types, "default search returned nothing"
    for non_game in NON_GAME_CONTENT_TYPES:
        assert non_game not in types, (
            f"default search leaked non-game content_type '{non_game}'"
        )
    # The game-like content types must still be returned.
    for expected in ("joc", "activitate"):
        assert expected in types
def test_default_search_with_text_excludes_non_game(search_service):
    """A text query without filters still excludes every non-game content type.

    All entries of ``NON_GAME_CONTENT_TYPES`` are checked by iterating the
    collection, so the test covers each type and does not rely on the
    constant supporting index access.
    """
    results = search_service.search_activities(search_text="conținut")
    types = _content_types(results)
    for non_game in NON_GAME_CONTENT_TYPES:
        assert non_game not in types, (
            f"text search leaked non-game content_type '{non_game}'"
        )
# --------------------------------------------------------------------------
# explicit content_type filter INCLUDES the non-game rows
# --------------------------------------------------------------------------
def test_explicit_content_type_filter_includes_non_game(search_service):
    """Filtering on ``content_type=reteta`` returns the single recipe row and nothing else."""
    recipe_filter = {"content_type": "reteta"}
    results = search_service.search_activities(filters=recipe_filter)
    types = _content_types(results)
    assert types == {"reteta"}, f"expected only rețete, got {types}"
    assert len(results) == 1
def test_explicit_content_type_filter_for_cantec(search_service):
    """Filtering on ``content_type=cantec`` returns only rows of that type."""
    song_filter = {"content_type": "cantec"}
    results = search_service.search_activities(filters=song_filter)
    assert _content_types(results) == {"cantec"}
# --------------------------------------------------------------------------
# a non-game CATEGORY filter also lifts the exclusion
# --------------------------------------------------------------------------
def test_non_game_category_filter_includes_non_game(search_service):
    """Filtering on ``category=cantece-ceremonii`` returns both songs and ceremonies."""
    category_filter = {"category": "cantece-ceremonii"}
    types = _content_types(
        search_service.search_activities(filters=category_filter)
    )
    for expected in ("cantec", "ceremonie"):
        assert expected in types
def test_game_category_filter_still_excludes_non_game(search_service):
    """Filtering by the 'wide-games' category keeps the non-game exclusion."""
    wide_games = search_service.search_activities(filters={"category": "wide-games"})
    seen = _content_types(wide_games)
    leaked = [kind for kind in NON_GAME_CONTENT_TYPES if kind in seen]
    assert not leaked
# --------------------------------------------------------------------------
# language filter
# --------------------------------------------------------------------------
def test_language_filter_ro(search_service):
    """language=ro returns a non-empty result whose rows are all tagged 'ro'."""
    rows = search_service.search_activities(filters={"language": "ro"})
    assert rows
    for row in rows:
        assert row.get("language") == "ro"
def test_language_filter_en(search_service):
    """language=en returns only 'en' rows, all named "Game in English"."""
    rows = search_service.search_activities(filters={"language": "en"})
    assert rows
    for row in rows:
        assert row.get("language") == "en"
    names = {row.get("name") for row in rows}
    assert names == {"Game in English"}
# --------------------------------------------------------------------------
# get_filter_options surfaces the new axes
# --------------------------------------------------------------------------
def test_filter_options_include_content_type_and_language(search_service):
    """get_filter_options() exposes the content_type and language axes.

    Expects 'joc' among the content types and exactly the languages
    'ro' and 'en'.
    """
    options = search_service.db.get_filter_options()
    for axis in ("content_type", "language"):
        assert axis in options
    assert "joc" in options["content_type"]
    assert set(options["language"]) == {"ro", "en"}

View File

@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
"""
Tests for scripts/validate_extractions.py.
Covers: schema rejection, the source_excerpt hallucination check, the content
of the generated re-extraction prompt, and the manifest `rejected` marking.
"""
import json
import sys
from pathlib import Path
# Repository root: two directory levels above this test file.
REPO_ROOT = Path(__file__).resolve().parent.parent
SCRIPTS_DIR = REPO_ROOT.joinpath("scripts")
# Put both directories at the front of sys.path (skipping any already
# present) so the `validate_extractions` import that follows can resolve.
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
import validate_extractions as ve # noqa: E402
# --------------------------------------------------------------------------
# helpers
# --------------------------------------------------------------------------
def _ext_activity(**over):
    """Return a baseline extracted-activity dict for tests.

    Args:
        **over: Fields to override in, or add to, the baseline record.

    Returns:
        A fresh dict with the baseline fields, updated with ``over``.
    """
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ancora din bucata sursa",
        "page_reference": "page 1",
    }
    return {**defaults, **over}
def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
    """Write ``<chunk_key>.json`` (header + activities) into ``extracted_dir``.

    Args:
        extracted_dir: Target directory; created (with parents) if missing.
        chunk_key: Used as the file stem and stored in the header.
        activities: List stored under the "activities" key.
        header_extra: Optional mapping merged over the default header fields.
    """
    extracted_dir.mkdir(parents=True, exist_ok=True)
    header = dict(
        source_hash="hash1234deadbeef",
        schema_version="1.0",
        prompt_version="1.0",
        chunk_range="pages 1-20",
        source_id="src01",
        chunk_key=chunk_key,
    )
    # A falsy header_extra (None, {}) leaves the defaults untouched.
    header.update(header_extra or {})
    target = extracted_dir / f"{chunk_key}.json"
    serialized = json.dumps(
        {"header": header, "activities": activities}, ensure_ascii=False
    )
    target.write_text(serialized, encoding="utf-8")
def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write ``text`` to ``chunks_dir/<source_id>/<chunk_key>.txt`` as UTF-8.

    The per-source directory is created (with parents) when it does not exist.
    """
    source_dir = chunks_dir / source_id
    source_dir.mkdir(parents=True, exist_ok=True)
    source_dir.joinpath(f"{chunk_key}.txt").write_text(text, encoding="utf-8")
# --------------------------------------------------------------------------
# tests
# --------------------------------------------------------------------------
def test_valid_file_passes(tmp_path):
    """An extraction whose excerpt appears in its chunk text is counted valid."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    excerpt = "ancora din bucata sursa apare aici"
    activity = _ext_activity(source_excerpt=excerpt)
    _write_extraction(extracted_dir, "src01.part01", [activity])
    # The chunk text contains the excerpt verbatim.
    _write_chunk(
        chunks_dir, "src01", "src01.part01", f"--- PAGE 1 ---\n{excerpt}\n"
    )
    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")
    assert report["valid"] == 1
    assert report["rejected"] == 0
def test_schema_invalid_file_rejected(tmp_path):
    """A file with an empty header and a name-only activity is rejected.

    Also expects a re-extraction prompt to be written under
    ``extracted/_reextract/``.
    """
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    extracted_dir.mkdir(parents=True)
    malformed = {"header": {}, "activities": [{"name": "x"}]}
    extraction_file = extracted_dir / "src01.part01.json"
    extraction_file.write_text(json.dumps(malformed), encoding="utf-8")
    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")
    assert report["rejected"] == 1
    assert (extracted_dir / "_reextract" / "src01.part01.prompt.md").exists()
def test_hallucinated_excerpt_rejected(tmp_path):
    """An excerpt absent from the chunk text is rejected as a hallucination.

    At least one error of the first rejected chunk must contain the word
    "hallucination".
    """
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    fabricated = _ext_activity(
        source_excerpt="citat complet inventat care nu exista qqqq"
    )
    _write_extraction(extracted_dir, "src01.part01", [fabricated])
    _write_chunk(
        chunks_dir,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntext complet diferit despre altceva.\n",
    )
    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")
    assert report["rejected"] == 1
    first_rejection = report["rejected_chunks"][0]
    assert any("hallucination" in message for message in first_rejection["errors"])
def test_reextraction_prompt_content(tmp_path):
    """The prompt generated for a rejected chunk contains the expected markers.

    Checks for the chunk key, the words "REJECTED" and "verbatim", and the
    ``data/extracted/src01.part01.json`` path.
    """
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    fabricated = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
    _write_extraction(extracted_dir, "src01.part01", [fabricated])
    _write_chunk(
        chunks_dir,
        "src01",
        "src01.part01",
        "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n",
    )
    ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")
    prompt_file = extracted_dir / "_reextract" / "src01.part01.prompt.md"
    prompt = prompt_file.read_text(encoding="utf-8")
    for fragment in (
        "src01.part01",
        "REJECTED",
        "verbatim",
        "data/extracted/src01.part01.json",
    ):
        assert fragment in prompt
def test_manifest_marks_chunk_rejected(tmp_path):
    """A chunk recorded as "done" is flipped to "rejected" in the manifest.

    The manifest is seeded on disk before the run and re-read afterwards.
    """
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    manifest_path = tmp_path / "manifest.json"
    seeded = {
        "chunks": {
            "src01.part01": {
                "state": "done",
                "chunk_file": "chunks/src01/src01.part01.txt",
            }
        }
    }
    manifest_path.write_text(json.dumps(seeded), encoding="utf-8")
    fabricated = _ext_activity(source_excerpt="citat fabricat absent vvvv")
    _write_extraction(extracted_dir, "src01.part01", [fabricated])
    _write_chunk(
        chunks_dir, "src01", "src01.part01",
        "--- PAGE 1 ---\nun continut neinrudit.\n",
    )
    ve.run(extracted_dir, chunks_dir, manifest_path)
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    chunk_entry = manifest["chunks"]["src01.part01"]
    assert chunk_entry["state"] == "rejected"
def test_build_reextraction_prompt_lists_errors():
    """build_reextraction_prompt() output mentions the chunk key and the error."""
    errors = ["header: 'source_hash' is a required property"]
    prompt = ve.build_reextraction_prompt(
        "abc.part03", "data/chunks/abc/abc.part03.txt", errors
    )
    for fragment in ("abc.part03", "source_hash"):
        assert fragment in prompt