Rebuild extraction pipeline infrastructure (Faza 0 prep)
Implements the approved plan to replace the broken regex/index-master extraction with an LLM-subagent pipeline. Four parallel lanes: Lane A — scripts/extract_common.py (PDF/docx/doc/pptx/html/zip, no max_pages truncation), normalize_sources.py, chunk_sources.py (~20pg chunks + overlap, manifest registry), activity_schema.json. Lane B — app/config_taxonomy.py (16 fixed category slugs), schema rebuilt from scratch in app/models/ with content_type, language, source_files, source_excerpt, normalized_name, extraction_confidence, needs_review; FTS5 + 3 triggers extended with materials_list and skills_developed. Lane C — build_database.py (--rebuild, atomic swap, schema + fuzzy source_excerpt validation, dedup with needs_review band), validate_extractions.py, review_queue.py, new run_extraction.py orchestrator, SUBAGENT_PROMPT.md. Lane D — search.py content_type/language filters (default search excludes non-game content), E7 schema-compat audit; fixed a NULL keywords AttributeError in _boost_search_relevance. Removes 8 orphaned/dead scripts and app/services/parser.py + indexer.py. Adds tests/ (70 passing, 1 skipped — libreoffice absent). Note: Lane D made one additive edit to app/models/database.py (_update_category_counts) to surface content_type/language in get_filter_options, outside its nominal lane boundary but after Lane B completed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
230
app/config_taxonomy.py
Normal file
230
app/config_taxonomy.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Controlled category taxonomy for game-library.
|
||||
|
||||
Single source of truth for activity categories. The DB stores the *slug*;
|
||||
the UI displays the Romanian name. `category` (thematic domain) and
|
||||
`content_type` (form of the content) are INDEPENDENT axes — see plan §2.
|
||||
"""
|
||||
|
||||
import unicodedata
|
||||
import re
|
||||
from typing import Dict, List
|
||||
|
||||
# --- Categories (thematic domain) --------------------------------------------
# slug -> Romanian display name. The slug is what the DB stores; the display
# name is what the UI shows. The set is fixed at 16 slugs; `altele` is the
# mandatory fallback and MUST always be present, because FALLBACK_CATEGORY
# below names it.
CATEGORIES: Dict[str, str] = {
    "jocuri-cercetasesti": "Jocuri cercetășești",
    "team-building": "Team-building",
    "icebreakers": "Icebreakers / spargerea gheții",
    "camp-outdoor": "Tabără și activități în aer liber",
    "wide-games": "Wide games / jocuri de teren",
    "orientare": "Orientare",
    "prim-ajutor": "Prim ajutor",
    "escape-room-puzzle": "Escape room și puzzle",
    "creative-stem": "Creativitate și STEM",
    "sports-active": "Sport și activități fizice",
    "cantece-ceremonii": "Cântece și ceremonii",
    "retete": "Rețete",
    "supravietuire": "Supraviețuire",
    "integrare-incluziune": "Integrare și incluziune",
    "conflict-empatie": "Conflict și empatie",
    "altele": "Altele",
}

# Mandatory fallback slug; must be a key of CATEGORIES.
FALLBACK_CATEGORY = "altele"

# Ordered list of valid slugs (same order as the CATEGORIES literal above,
# since dicts preserve insertion order).
CATEGORY_SLUGS: List[str] = list(CATEGORIES.keys())
|
||||
|
||||
# --- Content type (form of the content) --------------------------------------
# Independent axis from `category`: slug -> Romanian display name. The UI
# default search excludes the non-game content types (see plan §6).
CONTENT_TYPES: Dict[str, str] = {
    "joc": "Joc",
    "activitate": "Activitate",
    "reteta": "Rețetă",
    "cantec": "Cântec",
    "ceremonie": "Ceremonie",
}

# Valid content_type slugs, in the order declared in CONTENT_TYPES.
CONTENT_TYPE_SLUGS: List[str] = list(CONTENT_TYPES.keys())

# Content types considered "non-game" — excluded from the default UI search.
# Every entry here must also be a key of CONTENT_TYPES.
NON_GAME_CONTENT_TYPES: List[str] = ["reteta", "cantec", "ceremonie"]

# Slug used when a content type is missing or not recognised.
DEFAULT_CONTENT_TYPE = "activitate"
|
||||
|
||||
# --- Aliases -----------------------------------------------------------------
# Map of normalized arbitrary strings -> canonical slug. Keys are already
# diacritic-stripped, lowercased and hyphenated (see _slugify). This catches
# legacy / messy values from the old DB and common English/Romanian variants.
#
# Lookup notes (see normalize_category):
# - The table is consulted with the _slugify() form of the input, so a key
#   that is not in that form can never match.
# - CATEGORIES is checked before this table, so keys that are themselves
#   canonical slugs ("jocuri-cercetasesti", "supravietuire", "altele") are
#   never reached; they are kept only for readability.
# - Unknown values already fall back to "altele", so the entries that map to
#   "altele" document intent rather than change the result.
_CATEGORY_ALIASES: Dict[str, str] = {
    # legacy junk
    # NOTE(review): the single-letter keys are presumably section codes left
    # over from the old index format — confirm that collapsing them all into
    # the fallback is intended.
    "general-activity": "altele",
    "general": "altele",
    "educational": "creative-stem",
    "d": "altele",
    "a": "altele",
    "b": "altele",
    "c": "altele",
    # scouting
    "cercetasie": "jocuri-cercetasesti",
    "cercetasesti": "jocuri-cercetasesti",
    "scout": "jocuri-cercetasesti",
    "scouting": "jocuri-cercetasesti",
    "scout-games": "jocuri-cercetasesti",
    "jocuri-cercetasesti": "jocuri-cercetasesti",
    # team building
    "teambuilding": "team-building",
    "team": "team-building",
    "cooperare": "team-building",
    # icebreakers
    "icebreaker": "icebreakers",
    "spargerea-ghetii": "icebreakers",
    "cunoastere": "icebreakers",
    "energizers": "icebreakers",
    "energizer": "icebreakers",
    # camp / outdoor
    "camp": "camp-outdoor",
    "tabara": "camp-outdoor",
    "outdoor": "camp-outdoor",
    "aer-liber": "camp-outdoor",
    # wide games
    "wide-game": "wide-games",
    "jocuri-de-teren": "wide-games",
    "joc-de-teren": "wide-games",
    "big-games": "wide-games",
    # orientare
    "orienteering": "orientare",
    "navigatie": "orientare",
    # prim ajutor
    "first-aid": "prim-ajutor",
    "primul-ajutor": "prim-ajutor",
    # escape room / puzzle
    "escape-room": "escape-room-puzzle",
    "escaperoom": "escape-room-puzzle",
    "puzzle": "escape-room-puzzle",
    "puzzles": "escape-room-puzzle",
    "ghicitori": "escape-room-puzzle",
    # creative / stem
    "creative": "creative-stem",
    "creativitate": "creative-stem",
    "stem": "creative-stem",
    "arts-and-crafts": "creative-stem",
    "craft": "creative-stem",
    "crafts": "creative-stem",
    "stiinta": "creative-stem",
    # sports
    "sport": "sports-active",
    "sports": "sports-active",
    "sportive": "sports-active",
    "active": "sports-active",
    "miscare": "sports-active",
    "physical": "sports-active",
    # songs / ceremonies
    "cantece": "cantece-ceremonii",
    "cantec": "cantece-ceremonii",
    "songs": "cantece-ceremonii",
    "ceremonii": "cantece-ceremonii",
    "ceremonie": "cantece-ceremonii",
    "ceremony": "cantece-ceremonii",
    # recipes
    "reteta": "retete",
    "recipe": "retete",
    "recipes": "retete",
    "cooking": "retete",
    "gatit": "retete",
    # survival
    "survival": "supravietuire",
    "supravietuire": "supravietuire",
    # inclusion
    "integrare": "integrare-incluziune",
    "incluziune": "integrare-incluziune",
    "inclusion": "integrare-incluziune",
    # conflict / empathy
    "conflict": "conflict-empatie",
    "empatie": "conflict-empatie",
    "empathy": "conflict-empatie",
    "rezolvarea-conflictelor": "conflict-empatie",
    # fallback
    "altele": "altele",
    "other": "altele",
    "others": "altele",
    "misc": "altele",
}
|
||||
|
||||
|
||||
def _slugify(value: str) -> str:
    """Turn free text into a slug: accent-free, lowercase, hyphen-separated.

    Accented letters lose their combining marks after NFKD decomposition,
    every run of characters outside ``a-z0-9`` becomes a single hyphen, and
    hyphens at either end are dropped. Empty or falsy input yields ``""``.
    """
    if value:
        # NFKD splits e.g. "ă" into "a" + a combining mark; dropping the
        # combining marks leaves only the base letters.
        folded = "".join(
            ch
            for ch in unicodedata.normalize("NFKD", value)
            if unicodedata.combining(ch) == 0
        )
        hyphenated = re.sub(r"[^a-z0-9]+", "-", folded.lower().strip())
        return hyphenated.strip("-")
    return ""
|
||||
|
||||
|
||||
def normalize_category(value: str) -> str:
    """Resolve free-form category text to a canonical category slug.

    The input is slugified, then looked up first among the canonical slugs
    in CATEGORIES and then in the alias table. Empty input, input that
    slugifies to nothing, and unrecognised values all resolve to
    FALLBACK_CATEGORY (`altele`).
    """
    candidate = _slugify(str(value)) if value else ""
    if not candidate:
        return FALLBACK_CATEGORY
    if candidate in CATEGORIES:
        # Already a canonical slug.
        return candidate
    # Known alias, otherwise the fallback.
    return _CATEGORY_ALIASES.get(candidate, FALLBACK_CATEGORY)
|
||||
|
||||
|
||||
# Plural / English spellings accepted by normalize_content_type, keyed by
# their _slugify() form. Kept at module level (like _CATEGORY_ALIASES) so the
# table is built once rather than on every call. Singular and plural English
# forms are listed for every content type.
_CONTENT_TYPE_ALIASES: Dict[str, str] = {
    "jocuri": "joc",
    "game": "joc",
    "games": "joc",
    "activitati": "activitate",
    "activity": "activitate",
    "activities": "activitate",
    "retete": "reteta",
    "recipe": "reteta",
    "recipes": "reteta",
    "cantece": "cantec",
    "song": "cantec",
    "songs": "cantec",
    "ceremonii": "ceremonie",
    "ceremony": "ceremonie",
    "ceremonies": "ceremonie",
}


def normalize_content_type(value: str) -> str:
    """Map an arbitrary string to a valid content_type slug.

    The input is slugified and matched against the canonical slugs in
    CONTENT_TYPES, then against the plural / English aliases.

    Args:
        value: Free-form content type text; may be empty or None.

    Returns:
        One of CONTENT_TYPE_SLUGS. Empty or unrecognised input falls back to
        DEFAULT_CONTENT_TYPE (`activitate`).
    """
    if not value:
        return DEFAULT_CONTENT_TYPE
    slug = _slugify(str(value))
    if slug in CONTENT_TYPES:
        return slug
    return _CONTENT_TYPE_ALIASES.get(slug, DEFAULT_CONTENT_TYPE)
|
||||
|
||||
|
||||
def is_valid_category(slug: str) -> bool:
    """Report whether `slug` is one of the canonical category slugs."""
    known_slugs = CATEGORIES.keys()
    return slug in known_slugs
|
||||
|
||||
|
||||
def category_display_name(slug: str) -> str:
    """Return the Romanian label for a category slug.

    A slug that is not in CATEGORIES is returned unchanged.
    """
    try:
        return CATEGORIES[slug]
    except KeyError:
        return slug
|
||||
|
||||
|
||||
def content_type_display_name(slug: str) -> str:
    """Return the Romanian label for a content_type slug.

    A slug that is not in CONTENT_TYPES is returned unchanged.
    """
    if slug in CONTENT_TYPES:
        return CONTENT_TYPES[slug]
    return slug
|
||||
@@ -5,6 +5,22 @@ Activity data model for INDEX-SISTEM-JOCURI v2.0
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Any
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
|
||||
def normalize_name(name: str) -> str:
    """Build the dedup key for an activity name.

    The result has diacritics removed, is lowercased, has no leading or
    trailing whitespace, and has every internal whitespace run collapsed to
    one space. Empty or falsy input yields ``""``.

    Used as the exact-match key for dedup grouping (see plan §4).
    """
    if name:
        # NFKD separates base letters from their combining marks; keeping
        # only the non-combining characters removes the diacritics.
        base_chars = [
            ch
            for ch in unicodedata.normalize("NFKD", name)
            if unicodedata.combining(ch) == 0
        ]
        trimmed = "".join(base_chars).lower().strip()
        return re.sub(r"\s+", " ", trimmed)
    return ""
|
||||
|
||||
@dataclass
|
||||
class Activity:
|
||||
@@ -19,10 +35,19 @@ class Activity:
|
||||
# Categories
|
||||
category: str = ""
|
||||
subcategory: Optional[str] = None
|
||||
|
||||
# content_type is an axis INDEPENDENT of category:
|
||||
# one of joc/activitate/reteta/cantec/ceremonie (see config_taxonomy).
|
||||
content_type: Optional[str] = None
|
||||
|
||||
# Source information
|
||||
source_file: str = ""
|
||||
page_reference: Optional[str] = None
|
||||
# source_files: JSON-encoded list of every source the activity was seen in.
|
||||
# `source_file` (singular) stays as the primary/original source; build_database
|
||||
# (Lane C) accumulates the full list here on dedup-merge.
|
||||
source_files: List[str] = field(default_factory=list)
|
||||
# Short verbatim quote from the source — anti-hallucination anchor.
|
||||
source_excerpt: Optional[str] = None
|
||||
|
||||
# Age and participants
|
||||
age_group_min: Optional[int] = None
|
||||
@@ -44,11 +69,22 @@ class Activity:
|
||||
keywords: Optional[str] = None
|
||||
tags: List[str] = field(default_factory=list)
|
||||
popularity_score: int = 0
|
||||
|
||||
|
||||
# Extraction / language metadata
|
||||
language: Optional[str] = None # 'ro' / 'en'
|
||||
normalized_name: Optional[str] = None # dedup key; auto-derived from name
|
||||
extraction_confidence: Optional[str] = None # 'high' / 'med' / 'low'
|
||||
needs_review: int = 0
|
||||
|
||||
# Database fields
|
||||
id: Optional[int] = None
|
||||
created_at: Optional[str] = None
|
||||
updated_at: Optional[str] = None
|
||||
|
||||
def __post_init__(self):
|
||||
"""Derive normalized_name from name when not explicitly provided."""
|
||||
if not self.normalized_name:
|
||||
self.normalized_name = normalize_name(self.name)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Convert activity to dictionary for database storage"""
|
||||
@@ -59,8 +95,11 @@ class Activity:
|
||||
'variations': self.variations,
|
||||
'category': self.category,
|
||||
'subcategory': self.subcategory,
|
||||
'content_type': self.content_type,
|
||||
'source_file': self.source_file,
|
||||
'source_files': json.dumps(self.source_files) if self.source_files else None,
|
||||
'page_reference': self.page_reference,
|
||||
'source_excerpt': self.source_excerpt,
|
||||
'age_group_min': self.age_group_min,
|
||||
'age_group_max': self.age_group_max,
|
||||
'participants_min': self.participants_min,
|
||||
@@ -73,7 +112,11 @@ class Activity:
|
||||
'difficulty_level': self.difficulty_level,
|
||||
'keywords': self.keywords,
|
||||
'tags': json.dumps(self.tags) if self.tags else None,
|
||||
'popularity_score': self.popularity_score
|
||||
'popularity_score': self.popularity_score,
|
||||
'language': self.language,
|
||||
'normalized_name': self.normalized_name or normalize_name(self.name),
|
||||
'extraction_confidence': self.extraction_confidence,
|
||||
'needs_review': self.needs_review,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -86,7 +129,17 @@ class Activity:
|
||||
tags = json.loads(data['tags'])
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
tags = []
|
||||
|
||||
|
||||
# source_files may arrive as a JSON string (DB) or a list (extraction)
|
||||
source_files = data.get('source_files')
|
||||
if isinstance(source_files, str):
|
||||
try:
|
||||
source_files = json.loads(source_files)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
source_files = []
|
||||
elif source_files is None:
|
||||
source_files = []
|
||||
|
||||
return cls(
|
||||
id=data.get('id'),
|
||||
name=data.get('name', ''),
|
||||
@@ -95,8 +148,11 @@ class Activity:
|
||||
variations=data.get('variations'),
|
||||
category=data.get('category', ''),
|
||||
subcategory=data.get('subcategory'),
|
||||
content_type=data.get('content_type'),
|
||||
source_file=data.get('source_file', ''),
|
||||
source_files=source_files,
|
||||
page_reference=data.get('page_reference'),
|
||||
source_excerpt=data.get('source_excerpt'),
|
||||
age_group_min=data.get('age_group_min'),
|
||||
age_group_max=data.get('age_group_max'),
|
||||
participants_min=data.get('participants_min'),
|
||||
@@ -110,6 +166,10 @@ class Activity:
|
||||
keywords=data.get('keywords'),
|
||||
tags=tags,
|
||||
popularity_score=data.get('popularity_score', 0),
|
||||
language=data.get('language'),
|
||||
normalized_name=data.get('normalized_name'),
|
||||
extraction_confidence=data.get('extraction_confidence'),
|
||||
needs_review=data.get('needs_review', 0) or 0,
|
||||
created_at=data.get('created_at'),
|
||||
updated_at=data.get('updated_at')
|
||||
)
|
||||
|
||||
@@ -30,6 +30,8 @@ class DatabaseManager:
|
||||
"""Initialize database with v2.0 schema"""
|
||||
with self._get_connection() as conn:
|
||||
# Main activities table
|
||||
# NOTE: schema is rebuilt from scratch (plan §6) — no in-place
|
||||
# migration. The old DB is deleted and recreated by build_database.
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS activities (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
@@ -39,9 +41,12 @@ class DatabaseManager:
|
||||
variations TEXT,
|
||||
category TEXT NOT NULL,
|
||||
subcategory TEXT,
|
||||
content_type TEXT,
|
||||
source_file TEXT NOT NULL,
|
||||
source_files TEXT,
|
||||
page_reference TEXT,
|
||||
|
||||
source_excerpt TEXT,
|
||||
|
||||
-- Structured parameters
|
||||
age_group_min INTEGER,
|
||||
age_group_max INTEGER,
|
||||
@@ -49,26 +54,34 @@ class DatabaseManager:
|
||||
participants_max INTEGER,
|
||||
duration_min INTEGER,
|
||||
duration_max INTEGER,
|
||||
|
||||
|
||||
-- Categories for filtering
|
||||
materials_category TEXT,
|
||||
materials_list TEXT,
|
||||
skills_developed TEXT,
|
||||
difficulty_level TEXT,
|
||||
|
||||
|
||||
-- Metadata
|
||||
keywords TEXT,
|
||||
tags TEXT,
|
||||
popularity_score INTEGER DEFAULT 0,
|
||||
|
||||
-- Extraction / language metadata
|
||||
language TEXT,
|
||||
normalized_name TEXT,
|
||||
extraction_confidence TEXT,
|
||||
needs_review INTEGER DEFAULT 0,
|
||||
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
|
||||
# FTS5 virtual table for search
|
||||
conn.execute("""
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS activities_fts USING fts5(
|
||||
name, description, rules, variations, keywords,
|
||||
materials_list, skills_developed,
|
||||
content='activities',
|
||||
content_rowid='id'
|
||||
)
|
||||
@@ -92,6 +105,7 @@ class DatabaseManager:
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_age ON activities(age_group_min, age_group_max)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_participants ON activities(participants_min, participants_max)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_duration ON activities(duration_min, duration_max)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_activities_normalized_name ON activities(normalized_name)",
|
||||
"CREATE INDEX IF NOT EXISTS idx_categories_type ON categories(type)"
|
||||
]
|
||||
|
||||
@@ -102,24 +116,34 @@ class DatabaseManager:
|
||||
conn.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS activities_fts_insert AFTER INSERT ON activities
|
||||
BEGIN
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations,
|
||||
keywords, materials_list, skills_developed)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations,
|
||||
new.keywords, new.materials_list, new.skills_developed);
|
||||
END
|
||||
""")
|
||||
|
||||
|
||||
conn.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS activities_fts_delete AFTER DELETE ON activities
|
||||
BEGIN
|
||||
DELETE FROM activities_fts WHERE rowid = old.id;
|
||||
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
|
||||
variations, keywords, materials_list, skills_developed)
|
||||
VALUES ('delete', old.id, old.name, old.description, old.rules,
|
||||
old.variations, old.keywords, old.materials_list, old.skills_developed);
|
||||
END
|
||||
""")
|
||||
|
||||
|
||||
conn.execute("""
|
||||
CREATE TRIGGER IF NOT EXISTS activities_fts_update AFTER UPDATE ON activities
|
||||
BEGIN
|
||||
DELETE FROM activities_fts WHERE rowid = old.id;
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations, keywords)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations, new.keywords);
|
||||
INSERT INTO activities_fts(activities_fts, rowid, name, description, rules,
|
||||
variations, keywords, materials_list, skills_developed)
|
||||
VALUES ('delete', old.id, old.name, old.description, old.rules,
|
||||
old.variations, old.keywords, old.materials_list, old.skills_developed);
|
||||
INSERT INTO activities_fts(rowid, name, description, rules, variations,
|
||||
keywords, materials_list, skills_developed)
|
||||
VALUES (new.id, new.name, new.description, new.rules, new.variations,
|
||||
new.keywords, new.materials_list, new.skills_developed);
|
||||
END
|
||||
""")
|
||||
|
||||
@@ -179,6 +203,8 @@ class DatabaseManager:
|
||||
"""Update category usage counts"""
|
||||
categories_to_update = [
|
||||
('category', activity.category),
|
||||
('content_type', activity.content_type),
|
||||
('language', activity.language),
|
||||
('age_group', activity.get_age_range_display()),
|
||||
('participants', activity.get_participants_display()),
|
||||
('duration', activity.get_duration_display()),
|
||||
@@ -332,8 +358,11 @@ class DatabaseManager:
|
||||
def clear_database(self):
|
||||
"""Clear all data from database"""
|
||||
with self._get_connection() as conn:
|
||||
# Deleting from activities fires the delete trigger, which removes
|
||||
# the matching FTS rows. The explicit 'delete-all' command then
|
||||
# guarantees the external-content FTS index is fully cleared.
|
||||
conn.execute("DELETE FROM activities")
|
||||
conn.execute("DELETE FROM activities_fts")
|
||||
conn.execute("INSERT INTO activities_fts(activities_fts) VALUES('delete-all')")
|
||||
conn.execute("DELETE FROM categories")
|
||||
conn.commit()
|
||||
|
||||
|
||||
@@ -2,8 +2,6 @@
|
||||
Services for INDEX-SISTEM-JOCURI v2.0
|
||||
"""
|
||||
|
||||
from .parser import IndexMasterParser
|
||||
from .indexer import ActivityIndexer
|
||||
from .search import SearchService
|
||||
|
||||
__all__ = ['IndexMasterParser', 'ActivityIndexer', 'SearchService']
|
||||
__all__ = ['SearchService']
|
||||
|
||||
@@ -1,248 +0,0 @@
|
||||
"""
|
||||
Activity indexer service for INDEX-SISTEM-JOCURI v2.0
|
||||
Coordinates parsing and database indexing
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any
|
||||
from pathlib import Path
|
||||
from app.models.database import DatabaseManager
|
||||
from app.models.activity import Activity
|
||||
from app.services.parser import IndexMasterParser
|
||||
import time
|
||||
|
||||
class ActivityIndexer:
|
||||
"""Service for indexing activities from INDEX_MASTER into database"""
|
||||
|
||||
def __init__(self, db_manager: DatabaseManager, index_master_path: str):
|
||||
"""Initialize indexer with database manager and INDEX_MASTER path"""
|
||||
self.db = db_manager
|
||||
self.parser = IndexMasterParser(index_master_path)
|
||||
self.indexing_stats = {}
|
||||
|
||||
def index_all_activities(self, clear_existing: bool = False) -> Dict[str, Any]:
|
||||
"""Index all activities from INDEX_MASTER into database"""
|
||||
|
||||
print("🚀 Starting activity indexing process...")
|
||||
start_time = time.time()
|
||||
|
||||
# Clear existing data if requested
|
||||
if clear_existing:
|
||||
print("🗑️ Clearing existing database...")
|
||||
self.db.clear_database()
|
||||
|
||||
# Parse activities from INDEX_MASTER
|
||||
print("📖 Parsing INDEX_MASTER file...")
|
||||
activities = self.parser.parse_all_categories()
|
||||
|
||||
if not activities:
|
||||
print("❌ No activities were parsed!")
|
||||
return {'success': False, 'error': 'No activities parsed'}
|
||||
|
||||
# Filter valid activities
|
||||
valid_activities = []
|
||||
for activity in activities:
|
||||
if self.parser.validate_activity_completeness(activity):
|
||||
valid_activities.append(activity)
|
||||
else:
|
||||
print(f"⚠️ Skipping incomplete activity: {activity.name[:50]}...")
|
||||
|
||||
print(f"✅ Validated {len(valid_activities)} activities out of {len(activities)} parsed")
|
||||
|
||||
if len(valid_activities) < 100:
|
||||
print(f"⚠️ Warning: Only {len(valid_activities)} valid activities found. Expected 500+")
|
||||
|
||||
# Bulk insert into database
|
||||
print("💾 Inserting activities into database...")
|
||||
try:
|
||||
inserted_count = self.db.bulk_insert_activities(valid_activities)
|
||||
|
||||
# Rebuild FTS index for optimal search performance
|
||||
print("🔍 Rebuilding search index...")
|
||||
self.db.rebuild_fts_index()
|
||||
|
||||
end_time = time.time()
|
||||
indexing_time = end_time - start_time
|
||||
|
||||
# Generate final statistics (with error handling)
|
||||
try:
|
||||
stats = self._generate_indexing_stats(valid_activities, indexing_time)
|
||||
stats['inserted_count'] = inserted_count
|
||||
stats['success'] = True
|
||||
except Exception as e:
|
||||
print(f"⚠️ Error generating statistics: {e}")
|
||||
stats = {
|
||||
'success': True,
|
||||
'inserted_count': inserted_count,
|
||||
'indexing_time_seconds': indexing_time,
|
||||
'error': f'Stats generation failed: {str(e)}'
|
||||
}
|
||||
|
||||
print(f"✅ Indexing complete! {inserted_count} activities indexed in {indexing_time:.2f}s")
|
||||
|
||||
# Verify database state (with error handling)
|
||||
try:
|
||||
db_stats = self.db.get_statistics()
|
||||
print(f"📊 Database now contains {db_stats['total_activities']} activities")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Error getting database statistics: {e}")
|
||||
print(f"📊 Database insertion completed, statistics unavailable")
|
||||
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error during database insertion: {e}")
|
||||
return {'success': False, 'error': str(e)}
|
||||
|
||||
def index_specific_category(self, category_code: str) -> Dict[str, Any]:
|
||||
"""Index activities from a specific category only"""
|
||||
|
||||
print(f"🎯 Indexing specific category: {category_code}")
|
||||
|
||||
# Load content and parse specific category
|
||||
if not self.parser.load_content():
|
||||
return {'success': False, 'error': 'Could not load INDEX_MASTER'}
|
||||
|
||||
category_name = self.parser.category_mapping.get(category_code)
|
||||
if not category_name:
|
||||
return {'success': False, 'error': f'Unknown category code: {category_code}'}
|
||||
|
||||
activities = self.parser.parse_category_section(category_code, category_name)
|
||||
|
||||
if not activities:
|
||||
return {'success': False, 'error': f'No activities found in category {category_code}'}
|
||||
|
||||
# Filter valid activities
|
||||
valid_activities = [a for a in activities if self.parser.validate_activity_completeness(a)]
|
||||
|
||||
try:
|
||||
inserted_count = self.db.bulk_insert_activities(valid_activities)
|
||||
return {
|
||||
'success': True,
|
||||
'category': category_name,
|
||||
'inserted_count': inserted_count,
|
||||
'total_parsed': len(activities),
|
||||
'valid_activities': len(valid_activities)
|
||||
}
|
||||
except Exception as e:
|
||||
return {'success': False, 'error': str(e)}
|
||||
|
||||
def _generate_indexing_stats(self, activities: List[Activity], indexing_time: float) -> Dict[str, Any]:
|
||||
"""Generate comprehensive indexing statistics"""
|
||||
|
||||
# Get parser statistics
|
||||
parser_stats = self.parser.get_parsing_statistics()
|
||||
|
||||
# Calculate additional metrics
|
||||
categories = {}
|
||||
age_ranges = {}
|
||||
durations = {}
|
||||
materials = {}
|
||||
|
||||
for activity in activities:
|
||||
# Category breakdown
|
||||
if activity.category in categories:
|
||||
categories[activity.category] += 1
|
||||
else:
|
||||
categories[activity.category] = 1
|
||||
|
||||
# Age range analysis (with safety check)
|
||||
try:
|
||||
age_key = activity.get_age_range_display() or "nespecificat"
|
||||
age_ranges[age_key] = age_ranges.get(age_key, 0) + 1
|
||||
except Exception as e:
|
||||
print(f"Warning: Error getting age range for activity {activity.name}: {e}")
|
||||
age_ranges["nespecificat"] = age_ranges.get("nespecificat", 0) + 1
|
||||
|
||||
# Duration analysis (with safety check)
|
||||
try:
|
||||
duration_key = activity.get_duration_display() or "nespecificat"
|
||||
durations[duration_key] = durations.get(duration_key, 0) + 1
|
||||
except Exception as e:
|
||||
print(f"Warning: Error getting duration for activity {activity.name}: {e}")
|
||||
durations["nespecificat"] = durations.get("nespecificat", 0) + 1
|
||||
|
||||
# Materials analysis (with safety check)
|
||||
try:
|
||||
materials_key = activity.get_materials_display() or "nespecificat"
|
||||
materials[materials_key] = materials.get(materials_key, 0) + 1
|
||||
except Exception as e:
|
||||
print(f"Warning: Error getting materials for activity {activity.name}: {e}")
|
||||
materials["nespecificat"] = materials.get("nespecificat", 0) + 1
|
||||
|
||||
return {
|
||||
'indexing_time_seconds': indexing_time,
|
||||
'parsing_stats': parser_stats,
|
||||
'distribution': {
|
||||
'categories': categories,
|
||||
'age_ranges': age_ranges,
|
||||
'durations': durations,
|
||||
'materials': materials
|
||||
},
|
||||
'quality_metrics': {
|
||||
'completion_rate': parser_stats.get('completion_rate', 0),
|
||||
'average_description_length': parser_stats.get('average_description_length', 0),
|
||||
'activities_with_metadata': sum(1 for a in activities if a.age_group_min or a.participants_min or a.duration_min)
|
||||
}
|
||||
}
|
||||
|
||||
def verify_indexing_quality(self) -> Dict[str, Any]:
|
||||
"""Verify the quality of indexed data"""
|
||||
|
||||
try:
|
||||
# Get database statistics
|
||||
db_stats = self.db.get_statistics()
|
||||
|
||||
# Check for minimum activity count
|
||||
total_activities = db_stats['total_activities']
|
||||
meets_minimum = total_activities >= 500
|
||||
|
||||
# Check category distribution
|
||||
categories = db_stats.get('categories', {})
|
||||
category_coverage = len(categories)
|
||||
|
||||
# Sample some activities to check quality
|
||||
sample_activities = self.db.search_activities(limit=10)
|
||||
|
||||
quality_issues = []
|
||||
for activity in sample_activities:
|
||||
if not activity.get('description') or len(activity['description']) < 10:
|
||||
quality_issues.append(f"Activity {activity.get('name', 'Unknown')} has insufficient description")
|
||||
|
||||
if not activity.get('category'):
|
||||
quality_issues.append(f"Activity {activity.get('name', 'Unknown')} missing category")
|
||||
|
||||
return {
|
||||
'total_activities': total_activities,
|
||||
'meets_minimum_requirement': meets_minimum,
|
||||
'minimum_target': 500,
|
||||
'category_coverage': category_coverage,
|
||||
'expected_categories': len(self.parser.category_mapping),
|
||||
'quality_issues': quality_issues,
|
||||
'quality_score': max(0, 100 - len(quality_issues) * 10),
|
||||
'database_stats': db_stats
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {'error': str(e), 'quality_score': 0}
|
||||
|
||||
def get_indexing_progress(self) -> Dict[str, Any]:
|
||||
"""Get current indexing progress and status"""
|
||||
try:
|
||||
db_stats = self.db.get_statistics()
|
||||
|
||||
# Calculate progress towards 500+ activities goal
|
||||
total_activities = db_stats['total_activities']
|
||||
target_activities = 500
|
||||
progress_percentage = min(100, (total_activities / target_activities) * 100)
|
||||
|
||||
return {
|
||||
'current_activities': total_activities,
|
||||
'target_activities': target_activities,
|
||||
'progress_percentage': progress_percentage,
|
||||
'status': 'completed' if total_activities >= target_activities else 'in_progress',
|
||||
'categories_indexed': list(db_stats.get('categories', {}).keys()),
|
||||
'database_size_mb': db_stats.get('database_size_bytes', 0) / (1024 * 1024)
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {'error': str(e), 'status': 'error'}
|
||||
@@ -1,340 +0,0 @@
|
||||
"""
|
||||
Advanced parser for INDEX_MASTER_JOCURI_ACTIVITATI.md
|
||||
Extracts 500+ individual activities with full details
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from app.models.activity import Activity
|
||||
|
||||
class IndexMasterParser:
    """Parse INDEX_MASTER_JOCURI_ACTIVITATI.md into ``Activity`` objects.

    The parser looks for ``## [X] CATEGORY`` headings, splits each category
    into ``###`` subsections, and inside every subsection reads either a
    numbered ``**Exemple de jocuri:**`` list or ``**Fișier:**`` entries.
    Subsection-level ``**Label:** value`` fields (participants, duration,
    materials, keywords) are attached to every activity of that subsection.
    """

    # "8-30" style participant range. The negative lookahead rejects ranges
    # that are really ages ("8-12 ani"); the ``\d`` alternative stops the
    # regex from backtracking to a shorter number to dodge that check.
    _PARTICIPANT_RANGE_RE = re.compile(r"(\d+)-(\d+)(?!\d|\s*ani\b)")
    _PARTICIPANT_MIN_RE = re.compile(r"(\d+)\+")
    # ``\b`` keeps words such as "animatori" from being read as "ani".
    _AGE_RANGE_RE = re.compile(r"(\d+)-(\d+)\s*ani\b")
    _DURATION_RANGE_RE = re.compile(r"(\d+)-(\d+)\s*(?:minute|min)")
    _DURATION_SINGLE_RE = re.compile(r"(\d+)\+?\s*(?:minute|min)")

    def __init__(self, index_file_path: str):
        """Initialize parser with INDEX_MASTER file path."""
        self.index_file_path = Path(index_file_path)
        self.content = ""
        self.activities = []

        # Category mapping for main sections (exact match from file)
        self.category_mapping = {
            '[A]': 'JOCURI CERCETĂȘEȘTI ȘI SCOUT',
            '[B]': 'TEAM BUILDING ȘI COMUNICARE',
            '[C]': 'CAMPING ȘI ACTIVITĂȚI EXTERIOR',
            '[D]': 'ESCAPE ROOM ȘI PUZZLE-URI',
            '[E]': 'ORIENTARE ȘI BUSOLE',
            '[F]': 'PRIMUL AJUTOR ȘI SIGURANȚA',
            '[G]': 'ACTIVITĂȚI EDUCAȚIONALE',
            '[H]': 'RESURSE SPECIALE'
        }

    def load_content(self) -> bool:
        """Read the index file into ``self.content``.

        Returns:
            True when the file exists, could be read as UTF-8 and holds at
            least 1000 characters; False otherwise (a message is printed).
        """
        try:
            if not self.index_file_path.exists():
                print(f"❌ INDEX_MASTER file not found: {self.index_file_path}")
                return False

            with open(self.index_file_path, 'r', encoding='utf-8') as f:
                self.content = f.read()

            if len(self.content) < 1000:  # Sanity check
                print(f"⚠️ INDEX_MASTER file seems too small: {len(self.content)} chars")
                return False

            print(f"✅ Loaded INDEX_MASTER: {len(self.content)} characters")
            return True

        except (OSError, UnicodeError) as e:
            # Only I/O and encoding problems are expected here.
            print(f"❌ Error loading INDEX_MASTER: {e}")
            return False

    def parse_all_categories(self) -> List["Activity"]:
        """Parse every mapped category and return all extracted activities.

        ``self.activities`` is reset first, so calling this method more than
        once does not accumulate duplicates.

        Returns:
            The extracted activities, or an empty list if loading failed.
        """
        self.activities = []
        if not self.load_content():
            return []

        print("🔍 Starting comprehensive parsing of INDEX_MASTER...")

        # Parse each main category
        for category_code, category_name in self.category_mapping.items():
            print(f"\n📂 Processing category {category_code}: {category_name}")
            category_activities = self.parse_category_section(category_code, category_name)
            self.activities.extend(category_activities)
            print(f" ✅ Extracted {len(category_activities)} activities")

        print(f"\n🎯 Total activities extracted: {len(self.activities)}")
        return self.activities

    def parse_category_section(self, category_code: str, category_name: str) -> List["Activity"]:
        """Parse the section of one category.

        Args:
            category_code: Bracketed code such as ``'[A]'``.
            category_name: Heading text that follows the code.

        Returns:
            Activities found between this category heading and the next
            ``## [A-H]`` heading (or the end of the file); an empty list when
            the heading is not found.
        """
        # Whole-line match on the heading, case-insensitive.
        pattern = rf"^## {re.escape(category_code)} {re.escape(category_name)}\s*$"
        matches = list(re.finditer(pattern, self.content, re.MULTILINE | re.IGNORECASE))

        if not matches:
            print(f" ⚠️ Category section not found: {category_code}")
            return []

        # Take the last match (should be the actual section, not TOC)
        match = matches[-1]
        print(f" 📍 Found section at position {match.start()}")

        start_pos = match.end()

        # The section runs until the next main-category heading, if any.
        next_category_pattern = r"^## \[[A-H]\] [A-ZĂÂÎȘȚ]"
        next_match = re.search(next_category_pattern, self.content[start_pos:], re.MULTILINE)

        if next_match:
            section_content = self.content[start_pos:start_pos + next_match.start()]
        else:
            section_content = self.content[start_pos:]

        return self._parse_subsections(section_content, category_name)

    def _parse_subsections(self, section_content: str, category_name: str) -> List["Activity"]:
        """Split a category section on ``###`` headings and parse each part."""
        activities = []
        headings = list(re.finditer(r"^### (.+?)$", section_content, re.MULTILINE))

        for i, heading in enumerate(headings):
            subsection_title = heading.group(1).strip()

            # A subsection ends where the next heading starts.
            if i + 1 < len(headings):
                subsection_end = headings[i + 1].start()
            else:
                subsection_end = len(section_content)

            subsection_text = section_content[heading.end():subsection_end]
            activities.extend(
                self._parse_games_in_subsection(subsection_text, category_name, subsection_title)
            )

        return activities

    def _parse_games_in_subsection(self, subsection_text: str, category_name: str, subsection_title: str) -> List["Activity"]:
        """Build one activity per numbered game in a subsection.

        Games are read from ``**Exemple de jocuri:**`` lists whose entries
        look like ``1. **Name** - description``. File-based entries are then
        added via ``_parse_direct_activities``.
        """
        activities = []

        # Metadata belongs to the subsection, so extract it once rather than
        # once per game.
        metadata = self._extract_subsection_metadata(subsection_text)

        # Look for "Exemple de jocuri:" sections
        examples_pattern = r"\*\*Exemple de jocuri:\*\*\s*\n(.*?)(?=\n\*\*|$)"
        # Individual games (numbered list)
        game_pattern = r"^(\d+)\.\s*\*\*(.+?)\*\*\s*-\s*(.+?)$"

        for examples_match in re.finditer(examples_pattern, subsection_text, re.DOTALL):
            examples_text = examples_match.group(1)

            for game_match in re.finditer(game_pattern, examples_text, re.MULTILINE):
                game_number = game_match.group(1)
                game_name = game_match.group(2).strip()
                game_description = game_match.group(3).strip()

                activities.append(Activity(
                    name=game_name,
                    description=game_description,
                    category=category_name,
                    subcategory=subsection_title,
                    source_file="INDEX_MASTER_JOCURI_ACTIVITATI.md",
                    page_reference=f"{category_name} > {subsection_title} > #{game_number}",
                    **metadata
                ))

        # Also extract from direct activity descriptions without "Exemple de jocuri"
        activities.extend(self._parse_direct_activities(subsection_text, category_name, subsection_title))

        return activities

    @staticmethod
    def _find_field(label: str, text: str) -> Optional[str]:
        """Return the stripped value of a ``**label:** value`` field, or None.

        The value ends at a newline, at the next ``**`` marker, or at the end
        of the text, so a field on the last line is found even without a
        trailing newline.
        """
        match = re.search(rf"\*\*{re.escape(label)}:\*\*\s*(.+?)(?:\n|\*\*|$)", text)
        return match.group(1).strip() if match else None

    def _extract_subsection_metadata(self, subsection_text: str) -> Dict:
        """Collect participant, duration, material and keyword metadata.

        Returns:
            A dict holding only the keys that were found:
            ``participants_min/max``, ``age_group_min/max``,
            ``duration_min/max``, ``materials_list``, ``materials_category``
            and ``keywords``.
        """
        metadata = {}

        participants_text = self._find_field("Participanți", subsection_text)
        if participants_text:
            metadata.update(self._parse_participants(participants_text))

        duration_text = self._find_field("Durata", subsection_text)
        if duration_text:
            metadata.update(self._parse_duration(duration_text))

        materials_text = self._find_field("Materiale", subsection_text)
        if materials_text:
            metadata['materials_list'] = materials_text
            metadata['materials_category'] = self._categorize_materials(materials_text)

        keywords_text = self._find_field("Cuvinte cheie", subsection_text)
        if keywords_text:
            metadata['keywords'] = keywords_text

        return metadata

    def _parse_participants(self, participants_text: str) -> Dict:
        """Parse participant counts and age range from free text.

        Recognizes ranges such as "8-30 copii", open minimums such as "10+",
        and age ranges such as "8-12 ani". A range directly followed by "ani"
        is treated as an age range only, never as a participant count.
        """
        result = {}

        range_match = self._PARTICIPANT_RANGE_RE.search(participants_text)
        if range_match:
            result['participants_min'] = int(range_match.group(1))
            result['participants_max'] = int(range_match.group(2))
        else:
            # No usable range: fall back to an open minimum such as "10+".
            number_match = self._PARTICIPANT_MIN_RE.search(participants_text)
            if number_match:
                result['participants_min'] = int(number_match.group(1))

        age_match = self._AGE_RANGE_RE.search(participants_text)
        if age_match:
            result['age_group_min'] = int(age_match.group(1))
            result['age_group_max'] = int(age_match.group(2))

        return result

    def _parse_duration(self, duration_text: str) -> Dict:
        """Parse a duration such as "5-20 minute", "15-30min" or "15+ min".

        Returns:
            ``duration_min`` and ``duration_max`` for a range, only
            ``duration_min`` for a single value, or an empty dict.
        """
        result = {}

        range_match = self._DURATION_RANGE_RE.search(duration_text)
        if range_match:
            result['duration_min'] = int(range_match.group(1))
            result['duration_max'] = int(range_match.group(2))
        else:
            single_match = self._DURATION_SINGLE_RE.search(duration_text)
            if single_match:
                result['duration_min'] = int(single_match.group(1))

        return result

    def _categorize_materials(self, materials_text: str) -> str:
        """Map a free-text materials description onto a coarse category.

        The first rule with a keyword contained in the lower-cased text wins;
        rule order therefore matters.
        """
        materials_lower = materials_text.lower()

        rules = (
            (('fără', 'nu necesare', 'nimic', 'minime'), 'Fără materiale'),
            (('hârtie', 'creion', 'marker', 'simple'), 'Materiale simple'),
            (('computer', 'proiector', 'echipament', 'complexe'), 'Materiale complexe'),
        )
        for words, label in rules:
            if any(word in materials_lower for word in words):
                return label
        return 'Materiale variate'

    def _parse_direct_activities(self, subsection_text: str, category_name: str, subsection_title: str) -> List["Activity"]:
        """Create one collection activity per ``**Fișier:**`` entry.

        Only used for subsections without an ``**Exemple de jocuri:**`` list;
        otherwise an empty list is returned.
        """
        activities = []

        if "**Exemple de jocuri:**" in subsection_text:
            return activities

        # Same metadata for every file entry of the subsection.
        metadata = self._extract_subsection_metadata(subsection_text)

        # A back-quoted file name followed (anywhere later) by a bold span.
        file_pattern = r"\*\*Fișier:\*\*\s*`([^`]+)`.*?\*\*(.+?)\*\*"

        for file_match in re.finditer(file_pattern, subsection_text, re.DOTALL):
            file_name = file_match.group(1)
            description_part = file_match.group(2)

            # Create a general activity for this file
            activities.append(Activity(
                name=f"Activități din {file_name}",
                description=f"Colecție de activități din fișierul {file_name}. {description_part[:200]}...",
                category=category_name,
                subcategory=subsection_title,
                source_file=file_name,
                page_reference=f"{category_name} > {subsection_title}",
                **metadata
            ))

        return activities

    def validate_activity_completeness(self, activity: "Activity") -> bool:
        """Return True when the activity has all required fields filled in.

        ``name``, ``description``, ``category`` and ``source_file`` must be
        non-blank, and the description must be at least 10 characters long.
        """
        required_fields = ['name', 'description', 'category', 'source_file']

        for field in required_fields:
            value = getattr(activity, field)
            if not value or not value.strip():
                return False

        # Check minimum description length
        return len(activity.description) >= 10

    def get_parsing_statistics(self) -> Dict:
        """Summarize the activities collected by the last parse.

        Returns:
            ``{'total_activities': 0}`` when nothing was parsed; otherwise a
            dict with ``total_activities``, ``valid_activities``,
            ``completion_rate`` (percent), ``category_breakdown`` and
            ``average_description_length``.
        """
        total = len(self.activities)
        if not total:
            return {'total_activities': 0}

        category_counts = {}
        valid_activities = 0

        for activity in self.activities:
            category_counts[activity.category] = category_counts.get(activity.category, 0) + 1
            if self.validate_activity_completeness(activity):
                valid_activities += 1

        return {
            'total_activities': total,
            'valid_activities': valid_activities,
            'completion_rate': (valid_activities / total) * 100,
            'category_breakdown': category_counts,
            'average_description_length': sum(len(a.description) for a in self.activities) / total
        }
|
||||
@@ -5,8 +5,19 @@ Enhanced search with FTS5 and intelligent filtering
|
||||
|
||||
from typing import List, Dict, Any, Optional
|
||||
from app.models.database import DatabaseManager
|
||||
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
|
||||
import re
|
||||
|
||||
# Category slugs that are themselves "non-game" — selecting one of these as a
|
||||
# category filter also lifts the default non-game content_type exclusion.
|
||||
NON_GAME_CATEGORIES = {"retete", "cantece-ceremonii"}
|
||||
|
||||
# When a Python-side post-filter is active the DB LIMIT is applied *before*
|
||||
# filtering, so we over-fetch to still satisfy the caller's `limit`.
|
||||
_OVERSCAN_FACTOR = 5
|
||||
_OVERSCAN_CAP = 2000
|
||||
|
||||
|
||||
class SearchService:
|
||||
"""Enhanced search service with intelligent query processing"""
|
||||
|
||||
@@ -24,22 +35,72 @@ class SearchService:
|
||||
|
||||
if filters is None:
|
||||
filters = {}
|
||||
|
||||
|
||||
# Process and normalize search text
|
||||
processed_search = self._process_search_text(search_text)
|
||||
|
||||
|
||||
# Map web filters to database fields
|
||||
db_filters = self._map_filters_to_db_fields(filters)
|
||||
|
||||
|
||||
# content_type and language are filtered in Python: the DB layer does
|
||||
# not expose them as query parameters. The DEFAULT search excludes the
|
||||
# non-game content types (rețete / cântece / ceremonii) — they surface
|
||||
# only when the user explicitly filters that content_type, or picks a
|
||||
# non-game category. See plan §6.
|
||||
content_type, exclude_non_game = self._resolve_content_type_filter(filters)
|
||||
language = (filters.get('language') or '').strip().lower() or None
|
||||
post_filtering = bool(content_type or exclude_non_game or language)
|
||||
|
||||
# Over-fetch when post-filtering so the final list can still reach `limit`.
|
||||
fetch_limit = min(limit * _OVERSCAN_FACTOR, _OVERSCAN_CAP) if post_filtering else limit
|
||||
|
||||
# Perform database search
|
||||
results = self.db.search_activities(
|
||||
search_text=processed_search,
|
||||
**db_filters,
|
||||
limit=limit
|
||||
limit=fetch_limit
|
||||
)
|
||||
|
||||
# Post-process results for relevance and ranking
|
||||
return self._post_process_results(results, processed_search, filters)
|
||||
|
||||
# Apply content_type / language post-filters
|
||||
results = self._apply_content_type_filter(results, content_type, exclude_non_game)
|
||||
if language:
|
||||
results = [r for r in results
|
||||
if (r.get('language') or '').strip().lower() == language]
|
||||
|
||||
# Post-process results for relevance and ranking, then honour `limit`
|
||||
results = self._post_process_results(results, processed_search, filters)
|
||||
return results[:limit]
|
||||
|
||||
def _resolve_content_type_filter(self, filters: Dict[str, str]):
|
||||
"""Determine the content_type post-filter.
|
||||
|
||||
Returns (explicit_content_type | None, exclude_non_game: bool):
|
||||
- an explicit `content_type` filter → that value, no exclusion;
|
||||
- a `category` filter on a non-game category → no exclusion;
|
||||
- otherwise → default search, exclude non-game content types.
|
||||
"""
|
||||
content_type = (filters.get('content_type') or '').strip()
|
||||
if content_type:
|
||||
return content_type, False
|
||||
category = (filters.get('category') or '').strip()
|
||||
if category in NON_GAME_CATEGORIES:
|
||||
return None, False
|
||||
return None, True
|
||||
|
||||
def _apply_content_type_filter(self,
|
||||
results: List[Dict[str, Any]],
|
||||
content_type: Optional[str],
|
||||
exclude_non_game: bool) -> List[Dict[str, Any]]:
|
||||
"""Filter results by content_type (explicit include vs default exclude)."""
|
||||
if content_type:
|
||||
return [r for r in results
|
||||
if (r.get('content_type') or '') == content_type]
|
||||
if exclude_non_game:
|
||||
# Rows with NULL/unknown content_type are kept — only the known
|
||||
# non-game types are dropped from the default search.
|
||||
return [r for r in results
|
||||
if (r.get('content_type') or '') not in NON_GAME_CONTENT_TYPES]
|
||||
return results
|
||||
|
||||
def _process_search_text(self, search_text: Optional[str]) -> Optional[str]:
|
||||
"""Process and enhance search text for better FTS5 results"""
|
||||
@@ -83,10 +144,16 @@ class SearchService:
|
||||
if not filter_value or not filter_value.strip():
|
||||
continue
|
||||
|
||||
# content_type / language are NOT database query params — they are
|
||||
# applied as Python post-filters in search_activities(). Skip them
|
||||
# here so they never reach DatabaseManager.search_activities().
|
||||
if filter_key in ('content_type', 'language'):
|
||||
continue
|
||||
|
||||
# Map filter types to database fields
|
||||
if filter_key == 'category':
|
||||
db_filters['category'] = filter_value
|
||||
|
||||
|
||||
elif filter_key == 'age_group':
|
||||
# Parse age range (e.g., "5-8 ani", "12+ ani")
|
||||
age_match = re.search(r'(\d+)(?:-(\d+))?\s*ani?', filter_value)
|
||||
@@ -177,21 +244,22 @@ class SearchService:
|
||||
boost_score = 0
|
||||
|
||||
# Check name matches (highest priority)
|
||||
name_lower = result.get('name', '').lower()
|
||||
# NB: use `or ''` — nullable columns come back as None, not ''.
|
||||
name_lower = (result.get('name') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in name_lower:
|
||||
boost_score += 10
|
||||
if name_lower.startswith(term):
|
||||
boost_score += 5 # Extra boost for name starts with term
|
||||
|
||||
|
||||
# Check description matches
|
||||
desc_lower = result.get('description', '').lower()
|
||||
desc_lower = (result.get('description') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in desc_lower:
|
||||
boost_score += 3
|
||||
|
||||
|
||||
# Check keywords matches
|
||||
keywords_lower = result.get('keywords', '').lower()
|
||||
keywords_lower = (result.get('keywords') or '').lower()
|
||||
for term in search_terms:
|
||||
if term in keywords_lower:
|
||||
boost_score += 5
|
||||
@@ -280,11 +348,14 @@ class SearchService:
|
||||
return []
|
||||
|
||||
try:
|
||||
# Search for activities that match the partial query
|
||||
# Search for activities that match the partial query.
|
||||
# Over-fetch then drop non-game content types so autocomplete
|
||||
# mirrors the default search (no rețete / cântece / ceremonii).
|
||||
results = self.db.search_activities(
|
||||
search_text=f'"{partial_query}"',
|
||||
limit=limit * 2
|
||||
limit=limit * 6
|
||||
)
|
||||
results = self._apply_content_type_filter(results, None, True)
|
||||
|
||||
suggestions = []
|
||||
seen = set()
|
||||
|
||||
@@ -15,7 +15,13 @@
|
||||
<header class="activity-detail-header">
|
||||
<div class="activity-title-section">
|
||||
<h1 class="activity-detail-title">{{ activity.name }}</h1>
|
||||
<span class="activity-category-badge">{{ activity.category }}</span>
|
||||
<span class="activity-category-badge">{{ display_names.get(activity.category, activity.category) }}</span>
|
||||
{% if activity.content_type %}
|
||||
<span class="activity-content-type-badge">{{ display_names.get(activity.content_type, activity.content_type) }}</span>
|
||||
{% endif %}
|
||||
{% if activity.needs_review %}
|
||||
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
{% if activity.subcategory %}
|
||||
|
||||
@@ -36,7 +36,31 @@
|
||||
<select name="category" id="category" class="filter-select">
|
||||
<option value="">Toate categoriile</option>
|
||||
{% for category in filters.category %}
|
||||
<option value="{{ category }}">{{ category }}</option>
|
||||
<option value="{{ category }}">{{ display_names.get(category, category) }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.content_type %}
|
||||
<div class="filter-group">
|
||||
<label for="content_type" class="filter-label">Tip conținut</label>
|
||||
<select name="content_type" id="content_type" class="filter-select">
|
||||
<option value="">Doar jocuri și activități</option>
|
||||
{% for content_type in filters.content_type %}
|
||||
<option value="{{ content_type }}">{{ display_names.get(content_type, content_type) }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.language %}
|
||||
<div class="filter-group">
|
||||
<label for="language" class="filter-label">Limbă</label>
|
||||
<select name="language" id="language" class="filter-select">
|
||||
<option value="">Toate limbile</option>
|
||||
{% for language in filters.language %}
|
||||
<option value="{{ language }}">{{ display_names.get(language, language) }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
|
||||
@@ -24,7 +24,29 @@
|
||||
<option value="">Toate categoriile</option>
|
||||
{% for category in filters.category %}
|
||||
<option value="{{ category }}" {% if applied_filters.category == category %}selected{% endif %}>
|
||||
{{ category }}
|
||||
{{ display_names.get(category, category) }}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.content_type %}
|
||||
<select name="content_type" class="filter-select compact">
|
||||
<option value="">Doar jocuri și activități</option>
|
||||
{% for content_type in filters.content_type %}
|
||||
<option value="{{ content_type }}" {% if applied_filters.content_type == content_type %}selected{% endif %}>
|
||||
{{ display_names.get(content_type, content_type) }}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
{% endif %}
|
||||
|
||||
{% if filters.language %}
|
||||
<select name="language" class="filter-select compact">
|
||||
<option value="">Toate limbile</option>
|
||||
{% for language in filters.language %}
|
||||
<option value="{{ language }}" {% if applied_filters.language == language %}selected{% endif %}>
|
||||
{{ display_names.get(language, language) }}
|
||||
</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
@@ -109,7 +131,10 @@
|
||||
{{ activity.name }}
|
||||
</a>
|
||||
</h3>
|
||||
<span class="activity-category">{{ activity.category }}</span>
|
||||
<span class="activity-category">{{ display_names.get(activity.category, activity.category) }}</span>
|
||||
{% if activity.needs_review %}
|
||||
<span class="activity-badge needs-review" title="Această activitate necesită verificare">⚠ De verificat</span>
|
||||
{% endif %}
|
||||
</header>
|
||||
|
||||
<div class="activity-content">
|
||||
|
||||
@@ -7,11 +7,17 @@ from flask import Blueprint, request, render_template, jsonify, current_app
|
||||
from app.models.database import DatabaseManager
|
||||
from app.models.activity import Activity
|
||||
from app.services.search import SearchService
|
||||
from app.config_taxonomy import CATEGORIES, CONTENT_TYPES
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
bp = Blueprint('main', __name__)
|
||||
|
||||
# Slug -> Romanian display name. Category and content_type slugs never collide,
|
||||
# so a single flat map is enough for the UI filter labels.
|
||||
LANGUAGE_NAMES = {'ro': 'Română', 'en': 'Engleză'}
|
||||
DISPLAY_NAMES = {**CATEGORIES, **CONTENT_TYPES, **LANGUAGE_NAMES}
|
||||
|
||||
# Initialize database manager (will be configured in application factory)
|
||||
def get_db_manager():
|
||||
"""Get database manager instance"""
|
||||
@@ -36,15 +42,17 @@ def index():
|
||||
# Get database statistics for the interface
|
||||
stats = db.get_statistics()
|
||||
|
||||
return render_template('index.html',
|
||||
return render_template('index.html',
|
||||
filters=filter_options,
|
||||
display_names=DISPLAY_NAMES,
|
||||
stats=stats)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading main page: {e}")
|
||||
# Fallback with empty filters
|
||||
return render_template('index.html',
|
||||
return render_template('index.html',
|
||||
filters={},
|
||||
display_names=DISPLAY_NAMES,
|
||||
stats={'total_activities': 0})
|
||||
|
||||
@bp.route('/search', methods=['GET', 'POST'])
|
||||
@@ -82,8 +90,9 @@ def search():
|
||||
search_query=search_query,
|
||||
applied_filters=filters,
|
||||
filters=filter_options,
|
||||
display_names=DISPLAY_NAMES,
|
||||
results_count=len(activities))
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Search error: {e}")
|
||||
return render_template('results.html',
|
||||
@@ -91,6 +100,7 @@ def search():
|
||||
search_query='',
|
||||
applied_filters={},
|
||||
filters={},
|
||||
display_names=DISPLAY_NAMES,
|
||||
results_count=0,
|
||||
error=str(e))
|
||||
|
||||
@@ -121,6 +131,7 @@ def activity_detail(activity_id):
|
||||
|
||||
return render_template('activity.html',
|
||||
activity=activity,
|
||||
display_names=DISPLAY_NAMES,
|
||||
similar_activities=similar_activities)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
81
scripts/SUBAGENT_PROMPT.md
Normal file
81
scripts/SUBAGENT_PROMPT.md
Normal file
@@ -0,0 +1,81 @@
|
||||
# SUBAGENT — Activity extraction
|
||||
|
||||
You are a subagent in the game-library extraction pipeline. You extract
|
||||
educational activities (games, team-building, scouting, recipes, songs,
|
||||
ceremonies) from one chunk of a source document into structured JSON.
|
||||
|
||||
## Your task
|
||||
|
||||
1. **Read ONLY the chunk you were assigned.** Do not read other chunks, other
|
||||
files, or the original document. The chunk is a `.txt` file with
|
||||
`--- PAGE N ---` markers.
|
||||
2. Identify **every distinct activity** in the chunk.
|
||||
3. For each activity, fill the schema in `scripts/activity_schema.json`.
|
||||
4. Write the result to `data/extracted/<chunk_key>.json`.
|
||||
|
||||
## What counts as "a distinct activity"
|
||||
|
||||
A distinct activity is a self-contained game/activity/recipe/song/ceremony with
|
||||
its own name and a real description of how to do it. It is NOT:
|
||||
|
||||
- a bare mention or a cross-reference with no description — **skip it**;
|
||||
- a sub-variant of an activity already extracted — fold it into `variations`;
|
||||
- a heading, a table of contents entry, or running page chrome.
|
||||
|
||||
If the same activity is split across a page boundary inside your chunk, treat it
|
||||
as **one** activity and combine the text.
|
||||
|
||||
## Output format
|
||||
|
||||
The file is one JSON object: a `header` plus an `activities` array.
|
||||
|
||||
```json
|
||||
{
|
||||
"header": {
|
||||
"source_id": "<set from the prompt>",
|
||||
"chunk_key": "<set from the prompt>",
|
||||
"source_hash": "<set from the prompt>",
|
||||
"schema_version": "1.0",
|
||||
"prompt_version": "1.0",
|
||||
"chunk_range": "pages 1-20"
|
||||
},
|
||||
"activities": [ ... ]
|
||||
}
|
||||
```
|
||||
|
||||
## Rules for each activity
|
||||
|
||||
- **`name`** — the activity's real name (≥3 characters).
|
||||
- **`description`** — real prose describing the activity. No hard length limit,
|
||||
but it must actually describe what happens.
|
||||
- **`rules`** — how it is played / carried out, if the source gives rules.
|
||||
- **`category`** — exactly one taxonomy slug (see the `enum` in the schema):
|
||||
`jocuri-cercetasesti`, `team-building`, `icebreakers`, `camp-outdoor`,
|
||||
`wide-games`, `orientare`, `prim-ajutor`, `escape-room-puzzle`,
|
||||
`creative-stem`, `sports-active`, `cantece-ceremonii`, `retete`,
|
||||
`supravietuire`, `integrare-incluziune`, `conflict-empatie`, `altele`.
|
||||
When unsure, use `altele`.
|
||||
- **`content_type`** — the FORM of the content, independent of category:
|
||||
`joc`, `activitate`, `reteta`, `cantec`, or `ceremonie`.
|
||||
- **`language`** — `ro` or `en` (the language the activity is written in).
|
||||
- **`source_excerpt`** — **MANDATORY.** A short quote (one or two sentences)
|
||||
copied **verbatim** from the chunk. This is the anti-hallucination anchor: it
|
||||
is checked as a fuzzy substring of the chunk, and invented quotes are
|
||||
rejected.
|
||||
- **`page_reference`** — **MANDATORY.** The `--- PAGE N ---` marker(s) the
|
||||
activity came from, e.g. `"page 14"` or `"pages 14-15"`.
|
||||
- **`extraction_confidence`** — `high`, `med`, or `low`. Use `low` when the
|
||||
source text for the activity is thin or ambiguous.
|
||||
|
||||
## Never invent data
|
||||
|
||||
- Do **not** invent ages, participant counts, or durations. If the source does
|
||||
not state them, leave those fields `null`.
|
||||
- Do **not** paraphrase the `source_excerpt` — copy it character for character.
|
||||
- Better to extract fewer activities accurately than to pad the output.
|
||||
|
||||
## Before you finish
|
||||
|
||||
- Every activity has a non-empty `source_excerpt` and `page_reference`.
|
||||
- The file validates against `scripts/activity_schema.json`.
|
||||
- You only used text from your assigned chunk.
|
||||
110
scripts/activity_schema.json
Normal file
110
scripts/activity_schema.json
Normal file
@@ -0,0 +1,110 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"title": "Game-library extraction output",
|
||||
"description": "One subagent output file: a header carrying provenance/version metadata plus the list of activities extracted from a single chunk.",
|
||||
"type": "object",
|
||||
"required": ["header", "activities"],
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"header": {
|
||||
"type": "object",
|
||||
"required": ["source_hash", "schema_version", "prompt_version", "chunk_range"],
|
||||
"additionalProperties": true,
|
||||
"properties": {
|
||||
"source_hash": {"type": "string", "minLength": 8},
|
||||
"schema_version": {"type": "string"},
|
||||
"prompt_version": {"type": "string"},
|
||||
"chunk_range": {"type": "string"},
|
||||
"source_id": {"type": ["string", "null"]},
|
||||
"chunk_key": {"type": ["string", "null"]}
|
||||
}
|
||||
},
|
||||
"activities": {
|
||||
"type": "array",
|
||||
"items": {"$ref": "#/definitions/activity"}
|
||||
}
|
||||
},
|
||||
"definitions": {
|
||||
"activity": {
|
||||
"type": "object",
|
||||
"required": [
|
||||
"name",
|
||||
"description",
|
||||
"category",
|
||||
"content_type",
|
||||
"language",
|
||||
"extraction_confidence",
|
||||
"source_excerpt",
|
||||
"page_reference"
|
||||
],
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"name": {"type": "string", "minLength": 3},
|
||||
"description": {"type": "string", "minLength": 1},
|
||||
"rules": {"type": ["string", "null"]},
|
||||
"variations": {"type": ["string", "null"]},
|
||||
"category": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"jocuri-cercetasesti",
|
||||
"team-building",
|
||||
"icebreakers",
|
||||
"camp-outdoor",
|
||||
"wide-games",
|
||||
"orientare",
|
||||
"prim-ajutor",
|
||||
"escape-room-puzzle",
|
||||
"creative-stem",
|
||||
"sports-active",
|
||||
"cantece-ceremonii",
|
||||
"retete",
|
||||
"supravietuire",
|
||||
"integrare-incluziune",
|
||||
"conflict-empatie",
|
||||
"altele"
|
||||
]
|
||||
},
|
||||
"subcategory": {"type": ["string", "null"]},
|
||||
"content_type": {
|
||||
"type": "string",
|
||||
"enum": ["joc", "activitate", "reteta", "cantec", "ceremonie"]
|
||||
},
|
||||
"language": {"type": "string", "enum": ["ro", "en"]},
|
||||
"extraction_confidence": {
|
||||
"type": "string",
|
||||
"enum": ["high", "med", "low"]
|
||||
},
|
||||
"source_excerpt": {"type": "string", "minLength": 1},
|
||||
"page_reference": {"type": "string", "minLength": 1},
|
||||
"source_file": {"type": ["string", "null"]},
|
||||
"age_group_min": {"type": ["integer", "null"], "minimum": 0},
|
||||
"age_group_max": {"type": ["integer", "null"], "minimum": 0},
|
||||
"participants_min": {"type": ["integer", "null"], "minimum": 0},
|
||||
"participants_max": {"type": ["integer", "null"], "minimum": 0},
|
||||
"duration_min": {"type": ["integer", "null"], "minimum": 0},
|
||||
"duration_max": {"type": ["integer", "null"], "minimum": 0},
|
||||
"materials_category": {"type": ["string", "null"]},
|
||||
"materials_list": {
|
||||
"type": ["array", "null"],
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"skills_developed": {
|
||||
"type": ["array", "null"],
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"difficulty_level": {
|
||||
"type": ["string", "null"],
|
||||
"enum": ["usor", "mediu", "dificil", null]
|
||||
},
|
||||
"keywords": {
|
||||
"type": ["array", "null"],
|
||||
"items": {"type": "string"}
|
||||
},
|
||||
"tags": {
|
||||
"type": ["array", "null"],
|
||||
"items": {"type": "string"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
639
scripts/build_database.py
Normal file
639
scripts/build_database.py
Normal file
@@ -0,0 +1,639 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
build_database.py — build data/activities.db from the subagent extraction JSON.
|
||||
|
||||
Replaces the old import_claude_activities.py. Pipeline (plan §4):
|
||||
|
||||
1. `--rebuild` builds into data/activities.db.tmp; on success the live DB is
|
||||
backed up to data/activities.db.bak and the tmp file is swapped in with an
|
||||
atomic os.replace. A mid-build crash leaves the live DB untouched.
|
||||
2. Every data/extracted/*.json is validated against scripts/activity_schema.json;
|
||||
invalid files are moved to data/extracted/_rejected/ with an error log.
|
||||
2b. Each source_excerpt must appear as a fuzzy substring (rapidfuzz
|
||||
partial_ratio >= 90) of its source chunk — non-matches are hallucinations
|
||||
and the activity is dropped (logged to _rejected/).
|
||||
3. `category` is normalized to a valid taxonomy slug (fallback `altele`).
|
||||
4. Dedup (D5): group by exact normalized_name, never across languages; within a
|
||||
group rapidfuzz on descriptions — >=85 auto-merge, 60-85 borderline (keep
|
||||
both, needs_review), <60 separate variants.
|
||||
5. data/review_decisions.json is applied before insert.
|
||||
6. Bulk insert into the tmp DB, populate the categories table, rebuild FTS.
|
||||
7. A QA report is printed.
|
||||
|
||||
Usage:
|
||||
python scripts/build_database.py --rebuild
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent
|
||||
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
|
||||
if _p not in sys.path:
|
||||
sys.path.insert(0, _p)
|
||||
|
||||
from app.config_taxonomy import ( # noqa: E402
|
||||
category_display_name,
|
||||
normalize_category,
|
||||
normalize_content_type,
|
||||
)
|
||||
from app.models.activity import Activity # noqa: E402
|
||||
from app.models.database import DatabaseManager # noqa: E402
|
||||
from import_common import ( # noqa: E402
|
||||
DEFAULT_SCHEMA_PATH,
|
||||
content_key,
|
||||
excerpt_matches,
|
||||
find_chunk_text,
|
||||
iter_extraction_files,
|
||||
load_schema,
|
||||
normalize_name,
|
||||
source_path_for,
|
||||
)
|
||||
|
||||
# dedup thresholds (rapidfuzz token_sort_ratio, 0..100 scale)
|
||||
AUTO_MERGE_THRESHOLD = 85.0
|
||||
BORDERLINE_THRESHOLD = 60.0
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# extraction dict -> Activity
|
||||
# --------------------------------------------------------------------------
|
||||
def _csv(value: Any) -> Optional[str]:
    """Flatten a schema value into the comma string kept in the TEXT columns.

    ``None`` stays ``None``. A string is stripped. A list or tuple has every
    item stringified and stripped, blank items dropped and the remainder
    joined with ``", "``. Anything else is returned as ``str(value)``.
    A string or sequence that ends up empty collapses to ``None``.
    """
    if value is None:
        return None
    if isinstance(value, str):
        stripped = value.strip()
        return stripped if stripped else None
    if not isinstance(value, (list, tuple)):
        return str(value)
    cleaned = (str(item).strip() for item in value)
    joined = ", ".join(piece for piece in cleaned if piece)
    return joined if joined else None
|
||||
|
||||
|
||||
def _split_csv(value: Optional[str]) -> list[str]:
    """Split a comma-separated string into stripped, non-empty items.

    A falsy value (``None`` or ``""``) yields an empty list.
    """
    if not value:
        return []
    pieces = (fragment.strip() for fragment in str(value).split(","))
    return [piece for piece in pieces if piece]
|
||||
|
||||
|
||||
def dict_to_activity(adict: dict, source_file: str) -> Activity:
    """Convert one activity object from an extraction JSON into an Activity.

    ``tags`` and ``source_files`` may arrive as a list, a comma string or be
    missing; both end up as lists. ``source_file`` is put at the front of
    ``source_files`` when it is not already listed. ``category`` and
    ``content_type`` go through the taxonomy normalizers, and the array
    fields stored as TEXT are flattened with ``_csv``.
    """
    def _listify(raw: Any) -> list:
        # missing / null -> [], comma string -> split, list -> unchanged
        raw = raw or []
        return _split_csv(raw) if isinstance(raw, str) else raw

    tag_values = _listify(adict.get("tags"))
    origin_files = _listify(adict.get("source_files"))
    if source_file and source_file not in origin_files:
        origin_files = [source_file, *origin_files]

    # fields copied verbatim from the extraction object
    copied = (
        "rules", "variations", "subcategory", "page_reference",
        "source_excerpt", "age_group_min", "age_group_max",
        "participants_min", "participants_max", "duration_min",
        "duration_max", "materials_category", "difficulty_level",
        "language", "extraction_confidence",
    )
    kwargs = {field: adict.get(field) for field in copied}
    kwargs.update(
        name=(adict.get("name") or "").strip(),
        description=(adict.get("description") or "").strip(),
        category=normalize_category(adict.get("category", "")),
        content_type=normalize_content_type(adict.get("content_type", "")),
        source_file=source_file,
        source_files=list(origin_files),
        materials_list=_csv(adict.get("materials_list")),
        skills_developed=_csv(adict.get("skills_developed")),
        keywords=_csv(adict.get("keywords")),
        tags=list(tag_values),
    )
    return Activity(**kwargs)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 3 — category normalization is done in dict_to_activity; a non-taxonomy
|
||||
# value silently falls back to `altele`. This logs the substitutions.
|
||||
# --------------------------------------------------------------------------
|
||||
def log_category_fallbacks(raw_pairs: list[tuple[str, str]]) -> list[str]:
    """Describe every category that was replaced by the ``altele`` fallback.

    ``raw_pairs`` holds ``(original, slug)`` tuples. A message is produced
    only when the slug is ``altele`` and the normalized original is neither
    empty nor ``altele`` itself.
    """
    return [
        f"category '{original}' -> altele (not in taxonomy)"
        for original, slug in raw_pairs
        if slug == "altele"
        and normalize_name(original or "") not in ("", "altele")
    ]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 4 — dedup
|
||||
# --------------------------------------------------------------------------
|
||||
def _longest(*values: Optional[str]) -> Optional[str]:
    """Return the longest non-empty argument, or ``None`` if there is none.

    On a length tie the earliest argument wins.
    """
    candidates = [text for text in values if text]
    if not candidates:
        return None
    # max() keeps the first of several equally long strings
    return max(candidates, key=len)
|
||||
|
||||
|
||||
def _union_csv(values: list[Optional[str]]) -> Optional[str]:
    """Merge several comma strings into one, keeping first-seen order.

    Duplicate items are dropped; returns ``None`` when nothing is left.
    """
    # dict keys give an ordered, duplicate-free collection
    ordered = dict.fromkeys(
        item for value in values for item in _split_csv(value)
    )
    return ", ".join(ordered) or None
|
||||
|
||||
|
||||
def merge_cluster(cluster: list[Activity]) -> Activity:
    """Fold a cluster of duplicate activities into a single Activity.

    A one-element cluster is returned as is. Otherwise the member with the
    longest description is the representative: scalar fields are taken from
    it, free-text fields use the longest value in the cluster, comma-string
    fields, tags and sources are unioned, and ``popularity_score`` is the
    cluster maximum plus one per merged duplicate.
    """
    if len(cluster) == 1:
        return cluster[0]

    rep = max(cluster, key=lambda member: len(member.description or ""))

    # fields copied unchanged from the representative
    from_rep = (
        "name", "category", "subcategory", "content_type", "source_file",
        "page_reference", "source_excerpt", "age_group_min", "age_group_max",
        "participants_min", "participants_max", "duration_min",
        "duration_max", "materials_category", "difficulty_level",
        "language", "extraction_confidence",
    )
    merged = Activity(
        **{field: getattr(rep, field) for field in from_rep},
        description=_longest(*(m.description for m in cluster)) or rep.description,
        rules=_longest(*(m.rules for m in cluster)),
        variations=_longest(*(m.variations for m in cluster)),
        materials_list=_union_csv([m.materials_list for m in cluster]),
        skills_developed=_union_csv([m.skills_developed for m in cluster]),
        keywords=_union_csv([m.keywords for m in cluster]),
    )

    # tags: union across the cluster, first-seen order
    merged.tags = list(
        dict.fromkeys(tag for m in cluster for tag in (m.tags or []))
    )

    # every source the activity was seen in, primary file first per member
    merged.source_files = list(
        dict.fromkeys(
            src
            for m in cluster
            for src in [m.source_file, *(m.source_files or [])]
            if src
        )
    )

    # popularity_score++ per merged duplicate (plan §4)
    duplicates = len(cluster) - 1
    merged.popularity_score = max(m.popularity_score for m in cluster) + duplicates
    return merged
|
||||
|
||||
|
||||
def dedup_activities(activities: list[Activity]) -> tuple[list[Activity], dict]:
    """
    Dedup per plan D5.

    Activities are grouped by (normalized_name, language), so different
    languages are never merged. Inside a group each activity's description
    is scored with rapidfuzz ``token_sort_ratio`` against the first member
    of every existing cluster:
      >= AUTO_MERGE_THRESHOLD  -> joins the best-scoring cluster (auto-merge)
      >= BORDERLINE_THRESHOLD  -> starts its own cluster; it and the
                                  near-match clusters are flagged needs_review
      below                    -> starts its own cluster, no flag

    Returns ``(deduped_activities, stats)`` where stats has the keys
    ``input``, ``auto_merged``, ``borderline`` and ``output``.
    """
    from rapidfuzz import fuzz

    by_name_lang: dict[tuple, list[Activity]] = defaultdict(list)
    for activity in activities:
        name_key = activity.normalized_name or normalize_name(activity.name)
        by_name_lang[(name_key, activity.language)].append(activity)

    deduped: list[Activity] = []
    merged_away = 0
    flagged = 0

    for members in by_name_lang.values():
        clusters: list[list[Activity]] = []
        review_positions: set[int] = set()

        for candidate in members:
            text = candidate.description or ""
            scores = [
                fuzz.token_sort_ratio(text, cluster[0].description or "")
                for cluster in clusters
            ]
            mergeable = [
                (score, pos)
                for pos, score in enumerate(scores)
                if score >= AUTO_MERGE_THRESHOLD
            ]
            if mergeable:
                # highest score wins; max() keeps the earliest cluster on ties
                target = max(mergeable, key=lambda pair: pair[0])[1]
                clusters[target].append(candidate)
                continue

            # no auto-merge: every remaining score is below the merge threshold
            near = [
                pos
                for pos, score in enumerate(scores)
                if score >= BORDERLINE_THRESHOLD
            ]
            clusters.append([candidate])
            if near:
                review_positions.update(near)
                review_positions.add(len(clusters) - 1)

        for pos, cluster in enumerate(clusters):
            combined = merge_cluster(cluster)
            merged_away += len(cluster) - 1
            if pos in review_positions:
                combined.needs_review = 1
                flagged += 1
            deduped.append(combined)

    stats = {
        "input": len(activities),
        "auto_merged": merged_away,
        "borderline": flagged,
        "output": len(deduped),
    }
    return deduped, stats
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 5 — review decisions
|
||||
# --------------------------------------------------------------------------
|
||||
def load_review_decisions(path: Path) -> dict:
    """Load the reviewer decisions file, falling back to ``{}``.

    Returns the parsed JSON object when ``path`` is an existing file whose
    content is a JSON object. Returns ``{}`` when the file is missing,
    unreadable, not valid UTF-8, not valid JSON, or not a JSON object.
    Except for the missing-file case a warning is written to stderr,
    because an ignored file means every earlier review decision is lost
    for this build.
    """
    if not path or not path.is_file():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        print(
            f"WARNING: ignoring unreadable review decisions file {path}: {exc}",
            file=sys.stderr,
        )
        return {}
    if not isinstance(data, dict):
        print(
            f"WARNING: ignoring review decisions file {path}: "
            "top-level JSON value is not an object",
            file=sys.stderr,
        )
        return {}
    return data
|
||||
|
||||
|
||||
def apply_review_decisions(
    activities: list[Activity], decisions: dict
) -> tuple[list[Activity], dict]:
    """
    Apply data/review_decisions.json (plan §5c).

    Each activity is looked up by its content_key (normalized name,
    language, description). A decision may be a bare string or a dict with
    a ``decision`` entry. ``drop`` removes the activity; ``keep-separate``
    and ``merge`` reset needs_review to 0. Activities without a decision
    are kept unchanged.

    Returns ``(kept_activities, {"dropped": n, "resolved": m})``.
    """
    survivors: list[Activity] = []
    dropped = 0
    resolved = 0
    for activity in activities:
        lookup = content_key(
            activity.normalized_name or normalize_name(activity.name),
            activity.language,
            activity.description or "",
        )
        record = decisions.get(lookup)
        verdict = record.get("decision") if isinstance(record, dict) else record
        if verdict == "drop":
            dropped += 1
            continue
        if verdict in ("keep-separate", "merge"):
            activity.needs_review = 0
            resolved += 1
        survivors.append(activity)
    return survivors, {"dropped": dropped, "resolved": resolved}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# golden-set recall (plan §7)
|
||||
# --------------------------------------------------------------------------
|
||||
def _golden_names(data: Any) -> list[str]:
    """Pull activity names out of one parsed golden-set file.

    A dict is read through its ``activities`` entry (falling back to the
    dict itself when the key is absent); anything else is iterated
    directly. String entries are names as they stand; dict entries
    contribute their truthy ``name``. Other entries are ignored.
    """
    if isinstance(data, dict):
        entries = data.get("activities", data)
    else:
        entries = data
    return [
        entry if isinstance(entry, str) else entry["name"]
        for entry in entries or []
        if isinstance(entry, str)
        or (isinstance(entry, dict) and entry.get("name"))
    ]
|
||||
|
||||
|
||||
def golden_recall(golden_dir: Path, activities: list[Activity]) -> Optional[dict]:
    """Measure how many golden-set names appear among ``activities``.

    Every ``*.json`` in ``golden_dir`` is read (unparseable or unreadable
    files are skipped) and its names compared, after normalize_name, with
    the normalized activity names. Returns ``None`` when the directory is
    missing or lists no names, otherwise a dict with ``expected``,
    ``found`` and ``recall`` (rounded to 3 decimals).
    """
    if not golden_dir or not golden_dir.is_dir():
        return None

    known = {normalize_name(activity.name) for activity in activities}
    expected_names: list[str] = []
    for golden_file in sorted(golden_dir.glob("*.json")):
        try:
            payload = json.loads(golden_file.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            continue
        expected_names.extend(_golden_names(payload))

    total = len(expected_names)
    if total == 0:
        return None
    hits = sum(1 for name in expected_names if normalize_name(name) in known)
    return {"expected": total, "found": hits, "recall": round(hits / total, 3)}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# load + validate + excerpt-check the extraction files
|
||||
# --------------------------------------------------------------------------
|
||||
def collect_activities(
    extracted_dir: Path,
    chunks_dir: Path,
    sources_dir: Path,
    schema: dict,
) -> dict:
    """Validate, excerpt-check and convert every extraction file.

    For each file yielded by ``iter_extraction_files(extracted_dir)``:
      * a file that cannot be decoded or parsed as JSON, or that fails
        ``validate_extraction``, is moved to ``<extracted_dir>/_rejected``
        together with an error log;
      * each activity whose ``source_excerpt`` does not match the chunk
        text is dropped and logged as a hallucination (skipped when the
        chunk text is unavailable, since nothing can be verified);
      * the remaining activities are converted with ``dict_to_activity``.

    Returns a report dict with the counters ``files_total``,
    ``files_valid``, ``files_rejected_schema``, ``activities_raw`` and
    ``activities_hallucinated``, the list ``category_fallbacks`` and the
    converted objects under ``activities``.
    """
    # local import to avoid clutter; done once instead of once per file
    from import_common import chunk_key_for, validate_extraction

    rejected_dir = extracted_dir / "_rejected"
    activities: list[Activity] = []
    report = {
        "files_total": 0,
        "files_valid": 0,
        "files_rejected_schema": 0,
        "activities_raw": 0,
        "activities_hallucinated": 0,
        "category_fallbacks": [],
    }
    raw_categories: list[tuple[str, str]] = []

    for json_path in iter_extraction_files(extracted_dir):
        report["files_total"] += 1
        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
        except ValueError as exc:
            # ValueError covers json.JSONDecodeError and the
            # UnicodeDecodeError raised for non-UTF-8 bytes, so one badly
            # encoded file is rejected instead of aborting the rebuild.
            _reject_file(json_path, rejected_dir, [f"invalid JSON: {exc}"])
            report["files_rejected_schema"] += 1
            continue

        errors = validate_extraction(data, schema)
        if errors:
            _reject_file(json_path, rejected_dir, errors)
            report["files_rejected_schema"] += 1
            continue
        report["files_valid"] += 1

        header = data.get("header", {})
        chunk_text = find_chunk_text(json_path, header, chunks_dir)
        # no explicit source_id: derive it from the chunk key by cutting
        # the trailing ".partNN"
        source_id = header.get("source_id") or chunk_key_for(json_path, header).rsplit(
            ".part", 1
        )[0]
        fallback_source = (
            source_path_for(source_id, sources_dir) or source_id or json_path.stem
        )

        hallucinated: list[dict] = []
        for adict in data.get("activities", []):
            report["activities_raw"] += 1
            excerpt = adict.get("source_excerpt") or ""
            # if the chunk text is unavailable we cannot verify — keep it;
            # the QA report still counts it under activities_raw.
            if chunk_text is not None and not excerpt_matches(excerpt, chunk_text):
                hallucinated.append(adict)
                report["activities_hallucinated"] += 1
                continue
            src = adict.get("source_file") or fallback_source
            raw_category = adict.get("category", "")
            raw_categories.append((raw_category, normalize_category(raw_category)))
            activities.append(dict_to_activity(adict, src))

        if hallucinated:
            _log_hallucinations(json_path, rejected_dir, hallucinated)

    report["category_fallbacks"] = log_category_fallbacks(raw_categories)
    report["activities"] = activities
    return report
|
||||
|
||||
|
||||
def _reject_file(json_path: Path, rejected_dir: Path, errors: list[str]) -> None:
    """Move an invalid extraction file aside and record why.

    The file is moved into ``rejected_dir`` (created if needed) and a
    ``<stem>.errors.txt`` log listing ``errors`` is written next to it.
    """
    rejected_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(str(json_path), str(rejected_dir / json_path.name))

    bullets = "\n".join(f" - {e}" for e in errors)
    report_text = (
        f"REJECTED (schema validation): {json_path.name}\n\n" + bullets + "\n"
    )
    error_log = rejected_dir / f"{json_path.stem}.errors.txt"
    error_log.write_text(report_text, encoding="utf-8")
|
||||
|
||||
|
||||
def _log_hallucinations(
    json_path: Path, rejected_dir: Path, hallucinated: list[dict]
) -> None:
    """Write ``<stem>.hallucinations.txt`` listing the dropped activities.

    The log lives in ``rejected_dir`` (created if needed) and shows the
    repr of each dropped activity's name and source_excerpt.
    """
    rejected_dir.mkdir(parents=True, exist_ok=True)

    heading = f"DROPPED activities (source_excerpt not found in chunk): {json_path.name}"
    details: list[str] = []
    for entry in hallucinated:
        details += [
            f" - {entry.get('name')!r}",
            f" excerpt: {entry.get('source_excerpt')!r}",
        ]

    target = rejected_dir / f"{json_path.stem}.hallucinations.txt"
    target.write_text("\n".join([heading, "", *details]) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# DB write + atomic swap
|
||||
# --------------------------------------------------------------------------
|
||||
def _enrich_category_display_names(db_path: Path) -> None:
    """Set ``display_name`` on every ``type='category'`` row of ``categories``.

    The display name comes from ``category_display_name(slug)``. All updates
    are committed together and the connection is always closed.
    """
    import sqlite3

    connection = sqlite3.connect(db_path)
    try:
        slugs = [
            row[0]
            for row in connection.execute(
                "SELECT value FROM categories WHERE type = 'category'"
            ).fetchall()
        ]
        connection.executemany(
            "UPDATE categories SET display_name = ? WHERE type='category' AND value = ?",
            [(category_display_name(slug), slug) for slug in slugs],
        )
        connection.commit()
    finally:
        connection.close()
|
||||
|
||||
|
||||
def write_database(db_tmp_path: Path, activities: list[Activity]) -> None:
    """Build the temporary database from scratch.

    Any existing file at ``db_tmp_path`` is deleted first. A new
    DatabaseManager then bulk-inserts ``activities``, the category rows get
    their display names, and the FTS index is rebuilt.
    """
    # start clean: remove whatever file already sits at the tmp path
    if db_tmp_path.exists():
        db_tmp_path.unlink()

    manager = DatabaseManager(str(db_tmp_path))
    manager.bulk_insert_activities(activities)
    _enrich_category_display_names(db_tmp_path)
    manager.rebuild_fts_index()
|
||||
|
||||
|
||||
def atomic_swap(db_tmp_path: Path, db_path: Path) -> Optional[Path]:
    """Replace the live DB with the tmp file, backing the live DB up first.

    When ``db_path`` exists it is copied to ``<db_path>.bak`` before
    ``os.replace`` moves the tmp file over it. Returns the backup path, or
    ``None`` when there was no live DB to back up.
    """
    backup_path: Optional[Path] = (
        db_path.with_suffix(db_path.suffix + ".bak") if db_path.exists() else None
    )
    if backup_path is not None:
        shutil.copy2(db_path, backup_path)
    os.replace(db_tmp_path, db_path)
    return backup_path
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# orchestration
|
||||
# --------------------------------------------------------------------------
|
||||
def rebuild(
    *,
    extracted_dir: Path,
    chunks_dir: Path,
    sources_dir: Path,
    db_path: Path,
    decisions_path: Optional[Path] = None,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
    golden_dir: Optional[Path] = None,
    do_swap: bool = True,
) -> dict:
    """
    Run the full rebuild and return the report dict.

    Order: load the schema, collect activities, dedup, apply review
    decisions, write everything into ``<db_path>.tmp`` and, when
    ``do_swap`` is true, swap that file over the live DB. If writing or
    swapping raises, the tmp file is removed and the exception propagates.
    The database at ``db_path`` itself is only modified by the final swap.
    """
    extracted_dir = Path(extracted_dir)
    db_path = Path(db_path)
    staging_path = db_path.with_suffix(db_path.suffix + ".tmp")

    schema = load_schema(schema_path)
    collected = collect_activities(
        extracted_dir, Path(chunks_dir), Path(sources_dir), schema
    )
    raw_activities: list[Activity] = collected.pop("activities")

    deduped, dedup_stats = dedup_activities(raw_activities)

    decisions: dict = {}
    if decisions_path:
        decisions = load_review_decisions(Path(decisions_path))
    final, decision_stats = apply_review_decisions(deduped, decisions)

    backup = None
    try:
        write_database(staging_path, final)
        if do_swap:
            backup = atomic_swap(staging_path, db_path)
    except Exception:
        # never leave a half-built tmp DB behind
        if staging_path.exists():
            staging_path.unlink()
        raise

    return {
        **collected,
        "dedup": dedup_stats,
        "decisions": decision_stats,
        "final_count": len(final),
        "backup": str(backup) if backup else None,
        "swapped": do_swap,
        "qa": _qa_report(final, collected, golden_dir),
    }
|
||||
|
||||
|
||||
def _qa_report(
    activities: list[Activity], collected: dict, golden_dir: Optional[Path]
) -> dict:
    """Summarize the final activity list for the QA report.

    Counts activities per category, content_type and extraction_confidence
    (missing content_type / confidence are counted under ``"?"``), the
    percentage with non-blank rules, the needs_review rows, the
    hallucination rate taken from ``collected`` and, when ``golden_dir`` is
    given, the golden-set recall.
    """
    def _tally(values) -> dict:
        counts: dict = {}
        for value in values:
            counts[value] = counts.get(value, 0) + 1
        return counts

    total = len(activities)
    with_rules = sum(1 for a in activities if a.rules and a.rules.strip())
    raw = collected.get("activities_raw", 0)
    hallucinated = collected.get("activities_hallucinated", 0)

    return {
        "total": total,
        "per_category": _tally(a.category for a in activities),
        "per_content_type": _tally(a.content_type or "?" for a in activities),
        "extraction_confidence": _tally(
            a.extraction_confidence or "?" for a in activities
        ),
        "pct_with_rules": round(100 * with_rules / total, 1) if activities else 0.0,
        "needs_review": sum(1 for a in activities if a.needs_review),
        "hallucination_rate": round(100 * hallucinated / raw, 2) if raw else 0.0,
        "golden_recall": golden_recall(Path(golden_dir), activities) if golden_dir else None,
    }
|
||||
|
||||
|
||||
def print_report(report: dict) -> None:
    """Print the human-readable QA report for one rebuild to stdout.

    ``report`` is the dict returned by ``rebuild``; its ``qa`` entry holds
    the per-category / per-type / confidence counts.
    """
    qa = report["qa"]
    rule = "=" * 60

    def _table(title: str, rows) -> None:
        # one heading line followed by one line per (label, count) pair
        print(title)
        for label, count in rows:
            print(f" {label:<24}: {count}")

    def _by_count(counts: dict):
        # largest first; a stable sort keeps insertion order among ties
        return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)

    print(rule)
    print("BUILD DATABASE — QA REPORT")
    print(rule)
    print(f"extraction files : {report['files_total']} "
          f"(valid {report['files_valid']}, schema-rejected {report['files_rejected_schema']})")
    print(f"activities raw : {report['activities_raw']}")
    print(f" hallucinated drop : {report['activities_hallucinated']} "
          f"({qa['hallucination_rate']}%)")
    dedup = report["dedup"]
    print(f"dedup : {dedup['input']} -> {dedup['output']} "
          f"(auto-merged {dedup['auto_merged']}, borderline {dedup['borderline']})")
    print(f"review decisions : dropped {report['decisions']['dropped']}, "
          f"resolved {report['decisions']['resolved']}")
    print(f"final inserted : {report['final_count']}")
    print(f"% with rules : {qa['pct_with_rules']}")
    print(f"needs_review rows : {qa['needs_review']}")

    _table("per category :", _by_count(qa["per_category"]))
    _table("per content_type :", _by_count(qa["per_content_type"]))
    _table("extraction_confidence:", sorted(qa["extraction_confidence"].items()))

    recall = qa["golden_recall"]
    if recall:
        print(f"golden recall : {recall['found']}/{recall['expected']} = {recall['recall']}")
    if report["category_fallbacks"]:
        print("category fallbacks :")
        for msg in report["category_fallbacks"]:
            print(f" {msg}")
    if report["backup"]:
        print(f"live DB backed up to : {report['backup']}")
    print(rule)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI
|
||||
# --------------------------------------------------------------------------
|
||||
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point: parse options, rebuild, print the report.

    Only ``--rebuild`` is supported; without it the parser exits with an
    error. Returns 0 after the report has been printed.
    """
    parser = argparse.ArgumentParser(description="Build activities.db from extraction JSON.")
    parser.add_argument("--rebuild", action="store_true",
                        help="rebuild the database from scratch (only mode supported)")
    # path options, registered in the order they appear in --help
    path_options = (
        ("--extracted", "data/extracted"),
        ("--chunks", "data/chunks"),
        ("--sources", "data/sources"),
        ("--db", "data/activities.db"),
        ("--decisions", "data/review_decisions.json"),
        ("--golden", "data/golden"),
        ("--schema", str(DEFAULT_SCHEMA_PATH)),
    )
    for flag, default in path_options:
        parser.add_argument(flag, default=default)
    args = parser.parse_args(argv)

    if not args.rebuild:
        parser.error("only --rebuild is supported (full rebuild, no incremental merge)")

    print_report(
        rebuild(
            extracted_dir=Path(args.extracted),
            chunks_dir=Path(args.chunks),
            sources_dir=Path(args.sources),
            db_path=Path(args.db),
            decisions_path=Path(args.decisions),
            schema_path=Path(args.schema),
            golden_dir=Path(args.golden),
        )
    )
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
251
scripts/chunk_sources.py
Normal file
251
scripts/chunk_sources.py
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
chunk_sources.py — split normalized data/sources/*.txt into ~20-page chunks
|
||||
for subagent extraction, and maintain data/chunks/manifest.json.
|
||||
|
||||
Paginated text → ~20-page chunks, ~4-page overlap (plan D8).
|
||||
Unpaginated text → ~10000-word windows, ~2000-word overlap.
|
||||
|
||||
The manifest is a cache derived from the filesystem + per-chunk state. Re-running
|
||||
this script is idempotent: existing chunk states (pending/assigned/done/rejected)
|
||||
survive as long as the source content hash is unchanged.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
if str(SCRIPT_DIR) not in sys.path:
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from extract_common import content_hash, split_pages # noqa: E402
|
||||
|
||||
SCHEMA_VERSION = "1.0"
|
||||
PAGES_PER_CHUNK = 20
|
||||
PAGE_OVERLAP = 4
|
||||
WORD_WINDOW = 10_000
|
||||
WORD_OVERLAP = 2_000
|
||||
|
||||
VALID_STATES = {"pending", "assigned", "done", "rejected"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# header parsing
|
||||
# --------------------------------------------------------------------------
|
||||
def parse_source(text: str) -> tuple[dict, str]:
    """Split a normalized source file into ``(header_dict, body)``.

    Lines are scanned from the top. ``key: value`` lines seen before the
    rule line (a line made only of ``=``) fill the header. The body starts
    at the first ``--- PAGE `` marker, or right after the rule line when no
    marker follows, or at the first line when neither is present.
    """
    lines = text.splitlines()
    meta: dict = {}
    body_from = 0
    rule_seen = False

    for index, line in enumerate(lines):
        if line.startswith("--- PAGE "):
            body_from = index
            break
        if rule_seen:
            # past the header: only a page marker is still of interest
            continue
        stripped = line.strip()
        if stripped and stripped.count("=") == len(stripped):
            # header ends at the rule line
            body_from = index + 1
            rule_seen = True
        elif ":" in line:
            key, _, val = line.partition(":")
            meta[key.strip()] = val.strip()

    return meta, "\n".join(lines[body_from:])
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# chunking — pure functions
|
||||
# --------------------------------------------------------------------------
|
||||
def chunk_pages(
    pages: list[tuple[int, str]],
    pages_per_chunk: int = PAGES_PER_CHUNK,
    overlap: int = PAGE_OVERLAP,
) -> list[dict]:
    """
    Cut an ordered list of (page_no, text) into overlapping chunks.

    Consecutive chunks start ``max(1, pages_per_chunk - overlap)`` pages
    apart, so neighbouring chunks share ``overlap`` pages. Each chunk dict
    has ``page_start``, ``page_end``, ``chunk_range`` and ``text`` (the
    pages re-joined, each behind its ``--- PAGE n ---`` marker). Chunking
    stops at the first chunk that reaches the last page.
    """
    if not pages:
        return []

    step = max(1, pages_per_chunk - overlap)
    total = len(pages)
    result: list[dict] = []
    for start in range(0, total, step):
        group = pages[start : start + pages_per_chunk]
        first_page = group[0][0]
        last_page = group[-1][0]
        rendered = [
            f"\n--- PAGE {number} ---\n{content}\n" for number, content in group
        ]
        result.append(
            {
                "page_start": first_page,
                "page_end": last_page,
                "chunk_range": f"pages {first_page}-{last_page}",
                "text": "".join(rendered),
            }
        )
        if start + pages_per_chunk >= total:
            # this chunk already contains the final page
            break
    return result
|
||||
|
||||
|
||||
def chunk_words(
    text: str, window: int = WORD_WINDOW, overlap: int = WORD_OVERLAP
) -> list[dict]:
    """Cut unpaginated text into overlapping windows of whitespace-split words.

    Windows start ``max(1, window - overlap)`` words apart. Each chunk dict
    has ``word_start``, ``word_end`` (exclusive), ``chunk_range`` and
    ``text`` (the words re-joined with single spaces).
    """
    tokens = text.split()
    if not tokens:
        return []

    step = max(1, window - overlap)
    total = len(tokens)
    pieces: list[dict] = []
    for begin in range(0, total, step):
        segment = tokens[begin : begin + window]
        end = begin + len(segment)
        pieces.append(
            {
                "word_start": begin,
                "word_end": end,
                "chunk_range": f"words {begin}-{end}",
                "text": " ".join(segment),
            }
        )
        if begin + window >= total:
            # this window already reaches the last word
            break
    return pieces
|
||||
|
||||
|
||||
def make_chunks(source_text: str) -> list[dict]:
    """Chunk one normalized source file.

    The header is discarded. A body that ``split_pages`` can paginate is
    chunked by pages; otherwise it is chunked by word windows.
    """
    body = parse_source(source_text)[1]
    paginated = split_pages(body)
    return chunk_pages(paginated) if paginated else chunk_words(body)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# manifest
|
||||
# --------------------------------------------------------------------------
|
||||
def _empty_manifest() -> dict:
    """Return a fresh manifest: current schema version and no chunks."""
    return dict(schema_version=SCHEMA_VERSION, chunks={})
|
||||
|
||||
|
||||
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, falling back to an empty one.

    Returns the parsed manifest with ``schema_version`` and ``chunks``
    filled in when absent. Returns ``_empty_manifest()`` when the file is
    missing, unreadable, not valid UTF-8, not valid JSON, or not shaped
    like a manifest (top level and ``chunks`` must both be objects).
    Except for the missing-file case a warning goes to stderr, because
    discarding the manifest resets every chunk's recorded state.
    """
    if not manifest_path.exists():
        return _empty_manifest()
    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        print(
            f"WARNING: ignoring unreadable manifest {manifest_path}: {exc}",
            file=sys.stderr,
        )
        return _empty_manifest()
    if not isinstance(data, dict) or not isinstance(data.get("chunks", {}), dict):
        print(
            f"WARNING: ignoring malformed manifest {manifest_path}: "
            "expected a JSON object with an object under 'chunks'",
            file=sys.stderr,
        )
        return _empty_manifest()
    data.setdefault("schema_version", SCHEMA_VERSION)
    data.setdefault("chunks", {})
    return data
|
||||
|
||||
|
||||
def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write ``manifest`` to ``manifest_path`` as indented UTF-8 JSON.

    The parent directory is created if needed. The JSON is first written
    to a sibling ``<name>.tmp`` file and then renamed over the target, so
    an interrupted write cannot leave a truncated manifest in place.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, indent=2, ensure_ascii=False)
    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # same-directory rename: replaces the old manifest in one step
    tmp_path.replace(manifest_path)
|
||||
|
||||
|
||||
def chunk_source_file(
    source_path: Path, chunks_dir: Path, manifest: dict
) -> list[str]:
    """
    Chunk one source file and register its chunks in ``manifest``.

    ``<id>.txt`` becomes ``<chunks_dir>/<id>/<id>.partNN.txt`` files. Each
    chunk gets a manifest entry keyed ``<id>.partNN``. An entry keeps its
    previous state only when the stored source hash equals the current one
    and that state is one of VALID_STATES; otherwise it is ``pending``.
    Returns the chunk keys written, in order.
    """
    source_id = source_path.stem
    text = source_path.read_text(encoding="utf-8", errors="replace")
    src_hash = content_hash(text)
    pieces = make_chunks(text)

    target_dir = chunks_dir / source_id
    target_dir.mkdir(parents=True, exist_ok=True)

    registry = manifest["chunks"]
    keys: list[str] = []
    for part_no, piece in enumerate(pieces, 1):
        key = f"{source_id}.part{part_no:02d}"
        chunk_file = target_dir / f"{key}.txt"
        chunk_file.write_text(piece["text"], encoding="utf-8")

        # preserve state only if the source content is unchanged
        previous = registry.get(key)
        keep_state = (
            bool(previous)
            and previous.get("source_hash") == src_hash
            and previous.get("state") in VALID_STATES
        )

        registry[key] = {
            "source_id": source_id,
            "source_hash": src_hash,
            "part": part_no,
            "chunk_range": piece["chunk_range"],
            "chunk_file": str(chunk_file.relative_to(chunks_dir.parent)),
            "expected_json": f"{key}.json",
            "state": previous["state"] if keep_state else "pending",
        }
        keys.append(key)
    return keys
|
||||
|
||||
|
||||
def prune_stale(manifest: dict, live_keys: set[str]) -> list[str]:
    """
    Remove every ``manifest["chunks"]`` entry whose key is not in `live_keys`.

    The manifest is modified in place. Returns the removed keys in the order
    they appeared in the manifest.
    """
    registry = manifest["chunks"]
    removed: list[str] = []
    # Iterate over a snapshot of the keys because entries are deleted on the fly.
    for key in list(registry):
        if key in live_keys:
            continue
        removed.append(key)
        del registry[key]
    return removed
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI
|
||||
# --------------------------------------------------------------------------
|
||||
def run(sources_dir: Path, chunks_dir: Path) -> dict:
    """
    Chunk every ``*.txt`` file found directly in `sources_dir`.

    Loads ``<chunks_dir>/manifest.json``, re-chunks each source in sorted
    order, prunes manifest entries that were not produced by this run and
    saves the manifest back.

    Returns a summary dict with the number of sources, the number of live
    chunk keys, the number of pruned entries and a per-state chunk count.
    """
    manifest_path = chunks_dir / "manifest.json"
    manifest = load_manifest(manifest_path)

    source_files = sorted(sources_dir.glob("*.txt"))
    live_keys: set[str] = set()
    for source in source_files:
        live_keys.update(chunk_source_file(source, chunks_dir, manifest))

    stale = prune_stale(manifest, live_keys)
    save_manifest(manifest, manifest_path)

    # Tally how many chunks sit in each state after this run.
    states: dict[str, int] = {}
    for entry in manifest["chunks"].values():
        state = entry["state"]
        states[state] = states.get(state, 0) + 1

    summary = {
        "sources": len(source_files),
        "chunks": len(live_keys),
        "pruned": len(stale),
        "states": states,
    }
    return summary
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point: chunk the sources and print a short report.

    `argv` is forwarded to argparse (``None`` makes argparse read
    ``sys.argv``). Always returns exit code 0.
    """
    parser = argparse.ArgumentParser(description="Chunk normalized sources.")
    parser.add_argument("--sources", default="data/sources", help="sources dir")
    parser.add_argument("--chunks", default="data/chunks", help="chunks output dir")
    options = parser.parse_args(argv)

    summary = run(Path(options.sources), Path(options.chunks))

    report_lines = (
        ("sources processed : ", "sources"),
        ("chunks written : ", "chunks"),
        ("stale pruned : ", "pruned"),
    )
    for label, field in report_lines:
        print(f"{label}{summary[field]}")
    for state, count in sorted(summary["states"].items()):
        print(f" {state:<10}: {count}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,54 +0,0 @@
|
||||
# TEMPLATE PENTRU EXTRACȚIE ACTIVITĂȚI CU CLAUDE
|
||||
|
||||
## Instrucțiuni pentru Claude Code:
|
||||
|
||||
Pentru fiecare PDF/DOC, folosește următorul format de extracție:
|
||||
|
||||
### 1. Citește fișierul:
|
||||
```
|
||||
Claude, te rog citește fișierul: [CALE_FISIER]
|
||||
```
|
||||
|
||||
### 2. Extrage activitățile folosind acest template JSON:
|
||||
```json
|
||||
{
|
||||
"source_file": "[NUME_FISIER]",
|
||||
"activities": [
|
||||
{
|
||||
"name": "Numele activității",
|
||||
"description": "Descrierea completă a activității",
|
||||
"rules": "Regulile jocului/activității",
|
||||
"variations": "Variante sau adaptări",
|
||||
"category": "[A-H] bazat pe tip",
|
||||
"age_group_min": 6,
|
||||
"age_group_max": 14,
|
||||
"participants_min": 4,
|
||||
"participants_max": 20,
|
||||
"duration_min": 10,
|
||||
"duration_max": 30,
|
||||
"materials_list": "Lista materialelor necesare",
|
||||
"skills_developed": "Competențe dezvoltate",
|
||||
"difficulty_level": "Ușor/Mediu/Dificil",
|
||||
"keywords": "cuvinte cheie separate prin virgulă",
|
||||
"tags": "taguri relevante"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Salvează în fișier:
|
||||
După extracție, salvează JSON-ul în: `/scripts/extracted_activities/[NUME_FISIER].json`
|
||||
|
||||
### 4. Priorități de procesare:
|
||||
|
||||
**TOP PRIORITY (procesează primele):**
|
||||
1. 1000 Fantastic Scout Games.pdf
|
||||
2. Cartea Mare a jocurilor.pdf
|
||||
3. 160-de-activitati-dinamice-jocuri-pentru-team-building.pdf
|
||||
4. 101 Ways to Create an Unforgettable Camp Experience.pdf
|
||||
5. 151 Awesome Summer Camp Nature Activities.pdf
|
||||
|
||||
**Categorii de focus:**
|
||||
- [A] Jocuri Cercetășești
|
||||
- [C] Camping & Activități Exterior
|
||||
- [G] Activități Educaționale
|
||||
@@ -1,164 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
DATABASE SETUP SCRIPT - INDEX-SISTEM-JOCURI
|
||||
|
||||
Script pentru recrearea bazelor de date din .gitignore
|
||||
Folosește clasele DatabaseManager pentru consistență
|
||||
|
||||
Usage:
|
||||
python scripts/create_databases.py
|
||||
python scripts/create_databases.py --clear-existing
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path so we can import our modules
|
||||
sys.path.append(str(Path(__file__).parent.parent / 'src'))
|
||||
|
||||
from database import DatabaseManager
|
||||
from game_library_manager import GameLibraryManager
|
||||
|
||||
def create_main_database(db_path: str = "data/activities.db", clear: bool = False):
    """
    Create the main activities database at `db_path`.

    When `clear` is true and the file already exists it is deleted first.
    The database is opened through `DatabaseManager` and then probed with
    `get_statistics()`. Returns True when the probe succeeds, False when it
    raises.
    """
    target = Path(db_path)
    if clear and target.exists():
        print(f"🗑️ Removing existing database: {db_path}")
        target.unlink()

    print(f"📊 Creating main database: {db_path}")
    db = DatabaseManager(db_path)

    # Smoke-test the freshly opened database.
    try:
        stats = db.get_statistics()
        print(f"✅ Database created successfully: {stats['total_activities']} activities")
    except Exception as exc:
        print(f"❌ Error creating database: {exc}")
        return False
    return True
|
||||
|
||||
def create_game_library_database(db_path: str = "data/game_library.db", clear: bool = False):
    """
    Create the legacy game library database at `db_path`.

    When `clear` is true and the file already exists it is deleted first.
    Instantiating `GameLibraryManager` is the only creation step performed
    here. Always returns True; an exception raised by the manager propagates
    to the caller.
    """
    target = Path(db_path)
    if clear and target.exists():
        print(f"🗑️ Removing existing database: {db_path}")
        target.unlink()

    print(f"📊 Creating game library database: {db_path}")
    manager = GameLibraryManager(db_path)  # constructed for its side effect only

    print("✅ Game library database created successfully")
    return True
|
||||
|
||||
def create_test_database(db_path: str = "data/test_activities.db", clear: bool = False):
    """
    Create the test database at `db_path` and seed it with one sample row.

    When `clear` is true and the file already exists it is deleted first.
    Returns True when the sample activity is inserted and statistics can be
    read back, False when either step raises.
    """
    target = Path(db_path)
    if clear and target.exists():
        print(f"🗑️ Removing existing database: {db_path}")
        target.unlink()

    print(f"📊 Creating test database: {db_path}")
    db = DatabaseManager(db_path)

    # Single sample record used to verify that inserts work.
    sample = dict(
        title='Test Activity - Setup Script',
        description='This is a test activity created by the setup script',
        file_path='test/sample.txt',
        file_type='TXT',
        category='test',
        age_group='8-12 ani',
        participants='5-10 persoane',
        duration='15-30min',
        materials='Fără materiale',
        tags='["test", "setup"]',
        source_text='Sample test content for verification',
    )

    try:
        db.insert_activity(sample)
        stats = db.get_statistics()
        print(f"✅ Test database created with sample data: {stats['total_activities']} activities")
    except Exception as exc:
        print(f"❌ Error creating test database: {exc}")
        return False
    return True
|
||||
|
||||
def ensure_data_directory():
    """Create the relative ``data`` directory if it is missing and report which case applied."""
    data_dir = Path("data")
    if data_dir.exists():
        print(f"📁 Data directory exists: {data_dir}")
        return
    print(f"📁 Creating data directory: {data_dir}")
    data_dir.mkdir(parents=True)
|
||||
|
||||
def main():
    """
    Command-line entry point for database setup.

    Flags: ``--clear-existing``/``-c`` removes existing files first,
    ``--test-only`` and ``--main-only`` restrict the run to one database
    (``--test-only`` wins when both are given). Without either flag all three
    databases are created, each guarded so one failure does not stop the rest.

    Returns 0 when every requested database was created, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description='Create databases for INDEX-SISTEM-JOCURI')
    parser.add_argument('--clear-existing', '-c', action='store_true',
                        help='Remove existing databases before creating new ones')
    parser.add_argument('--main-only', action='store_true',
                        help='Create only the main activities database')
    parser.add_argument('--test-only', action='store_true',
                        help='Create only the test database')
    args = parser.parse_args()

    print("🚀 DATABASE SETUP - INDEX-SISTEM-JOCURI")
    print("=" * 50)

    ensure_data_directory()

    clear = args.clear_existing
    success_count = 0
    total_count = 0

    if args.test_only:
        total_count = 1
        if create_test_database(clear=clear):
            success_count += 1
    elif args.main_only:
        total_count = 1
        if create_main_database(clear=clear):
            success_count += 1
    else:
        # Full setup: every database, each isolated from the others' failures.
        builders = [
            ("Main activities", create_main_database),
            ("Game library", create_game_library_database),
            ("Test activities", create_test_database),
        ]
        total_count = len(builders)

        for name, builder in builders:
            print(f"\n📂 Creating {name} database...")
            try:
                if builder(clear=clear):
                    success_count += 1
            except Exception as exc:
                print(f"❌ Failed to create {name} database: {exc}")

    print("\n" + "=" * 50)
    print(f"🎯 SUMMARY: {success_count}/{total_count} databases created successfully")

    if success_count != total_count:
        print("⚠️ Some databases failed to create. Check errors above.")
        return 1

    print("✅ All databases ready!")
    print("\nNext steps:")
    print("1. Run indexer: cd src && python indexer.py --clear-db")
    print("2. Start web app: cd src && python app.py")
    return 0
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
361
scripts/extract_common.py
Normal file
361
scripts/extract_common.py
Normal file
@@ -0,0 +1,361 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
extract_common.py — single home for per-format text extraction.
|
||||
|
||||
Every extractor returns a plain text *body* with synthetic page markers
|
||||
(`--- PAGE N ---`). The file-level header (`SOURCE:` / `CONVERTED:`) is added
|
||||
by normalize_sources.py, not here.
|
||||
|
||||
Critical fix vs. the old pdf_to_text_converter.py: there is NO `max_pages` cap.
|
||||
Large books are extracted in full.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import importlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
PAGE_MARKER_RE = re.compile(r"^--- PAGE (\d+) ---\s*$", re.MULTILINE)
|
||||
|
||||
# paragraphs per synthetic page for paginated-by-flow formats (docx)
|
||||
DOCX_PARAS_PER_PAGE = 40
|
||||
|
||||
# formats we deliberately ignore (epub duplicates existing PDFs — plan §1)
|
||||
IGNORED_EXTENSIONS = {".epub"}
|
||||
|
||||
# obvious junk filenames skipped during a walk
|
||||
JUNK_NAMES = {"desktop.ini", "linkuri-jocuri.txt"}
|
||||
JUNK_SUFFIXES = {".bak", ".tmp", ".ini"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# page assembly helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def join_pages(pages: list[str], start: int = 1) -> str:
    """
    Build a body string from page texts, one ``--- PAGE N ---`` marker per page.

    Pages are numbered from `start`. Each page text is stripped; a falsy entry
    (``None`` or ``""``) yields an empty page that still gets its marker.
    An empty list produces an empty string.
    """
    return "".join(
        f"\n--- PAGE {number} ---\n{(content or '').strip()}\n"
        for number, content in enumerate(pages, start)
    )
|
||||
|
||||
|
||||
def split_pages(body: str) -> list[tuple[int, str]]:
    """
    Parse a page-marked body into ``[(page_number, text), ...]``.

    Page numbers are taken from the markers themselves. The text of a page is
    everything between its marker and the next one (or the end of the body),
    stripped. A body without markers yields an empty list.
    """
    markers = list(PAGE_MARKER_RE.finditer(body))
    # A segment ends where the following marker begins; the last one runs to
    # the end of the body.
    segment_ends = [marker.start() for marker in markers[1:]] + [len(body)]
    return [
        (int(marker.group(1)), body[marker.end():segment_end].strip())
        for marker, segment_end in zip(markers, segment_ends)
    ]
|
||||
|
||||
|
||||
def count_page_markers(body: str) -> int:
    """Return how many ``--- PAGE N ---`` marker lines `body` contains."""
    return sum(1 for _ in PAGE_MARKER_RE.finditer(body))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# format detection
|
||||
# --------------------------------------------------------------------------
|
||||
FORMAT_BY_EXT = {
|
||||
".pdf": "pdf",
|
||||
".docx": "docx",
|
||||
".doc": "doc",
|
||||
".pptx": "pptx",
|
||||
".ppt": "pptx",
|
||||
".htm": "html",
|
||||
".html": "html",
|
||||
".zip": "zip",
|
||||
".epub": "epub",
|
||||
".txt": "txt",
|
||||
}
|
||||
|
||||
|
||||
def detect_format(path: str | os.PathLike) -> str:
    """
    Map a path to a format key using its lower-cased file extension.

    Returns the `FORMAT_BY_EXT` entry for the extension, or ``"unknown"`` when
    the extension is not listed.
    """
    suffix = Path(path).suffix.lower()
    if suffix in FORMAT_BY_EXT:
        return FORMAT_BY_EXT[suffix]
    return "unknown"
|
||||
|
||||
|
||||
def is_junk(path: str | os.PathLike) -> bool:
    """
    Tell whether a file should be skipped during a walk.

    A file is junk when its lower-cased name is in `JUNK_NAMES`, when it is a
    ``readme*.md`` file, or when its lower-cased suffix is in `JUNK_SUFFIXES`.
    """
    candidate = Path(path)
    lowered_name = candidate.name.lower()
    lowered_suffix = candidate.suffix.lower()
    return (
        lowered_name in JUNK_NAMES
        or (lowered_name.startswith("readme") and lowered_suffix == ".md")
        or lowered_suffix in JUNK_SUFFIXES
    )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# content hashing + near-duplicate elimination
|
||||
# --------------------------------------------------------------------------
|
||||
def _normalize_for_hash(text: str) -> str:
    """Collapse whitespace runs to single spaces, trim the ends and lower-case; falsy input gives ``""``."""
    collapsed = re.sub(r"\s+", " ", text or "")
    return collapsed.strip().lower()
|
||||
|
||||
|
||||
def content_hash(text: str) -> str:
    """
    Return the SHA-1 hex digest of `text` after `_normalize_for_hash`.

    Texts that differ only in whitespace layout or letter case therefore get
    the same hash, which is what exact-duplicate detection relies on.
    """
    canonical = _normalize_for_hash(text)
    digest = hashlib.sha1(canonical.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def near_duplicate_ratio(a: str, b: str) -> float:
    """
    Score the similarity of two texts with rapidfuzz ``token_sort_ratio``.

    Both texts go through `_normalize_for_hash` before comparison. rapidfuzz
    is imported lazily, inside the function.
    """
    from rapidfuzz import fuzz

    left = _normalize_for_hash(a)
    right = _normalize_for_hash(b)
    return fuzz.token_sort_ratio(left, right)
|
||||
|
||||
|
||||
def dedupe_texts(
    items: list[tuple[str, str]], threshold: float = 95.0
) -> list[tuple[str, str]]:
    """
    Filter exact and near-duplicate texts out of a list of (key, text) pairs.

    The first occurrence wins. A pair is dropped when its `content_hash` was
    already seen on a kept pair, or when `near_duplicate_ratio` against any
    kept text reaches `threshold`. The fuzzy comparison only runs against
    pairs that were kept, never against dropped ones.
    """
    survivors: list[tuple[str, str]] = []
    kept_hashes: set[str] = set()
    for key, text in items:
        digest = content_hash(text)
        if digest in kept_hashes:
            continue

        # Stop at the first kept text that is similar enough.
        has_near_twin = False
        for _, kept_text in survivors:
            if near_duplicate_ratio(text, kept_text) >= threshold:
                has_near_twin = True
                break
        if has_near_twin:
            continue

        kept_hashes.add(digest)
        survivors.append((key, text))
    return survivors
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# preflight dependency check
|
||||
# --------------------------------------------------------------------------
|
||||
REQUIRED_PYTHON_MODULES = {
|
||||
"pdfplumber": "pdfplumber",
|
||||
"PyPDF2": "pypdf2",
|
||||
"docx": "python-docx",
|
||||
"pptx": "python-pptx",
|
||||
"bs4": "beautifulsoup4",
|
||||
"lxml": "lxml",
|
||||
"jsonschema": "jsonschema",
|
||||
"rapidfuzz": "rapidfuzz",
|
||||
"chardet": "chardet",
|
||||
}
|
||||
|
||||
|
||||
def preflight(check_ocr: bool = False) -> dict:
    """
    Report missing dependencies before a long normalization run.

    Every module in `REQUIRED_PYTHON_MODULES` is imported; the pip name of
    each one that raises ImportError is listed under ``missing_python``.
    A missing ``libreoffice``/``soffice`` binary only produces a warning.
    ``tesseract`` is looked up only when `check_ocr` is true and, if absent,
    is listed under ``missing_system``.

    Returns ``{'ok', 'missing_python', 'missing_system', 'warnings'}`` where
    ``ok`` is true when both "missing" lists are empty.
    """

    def _cannot_import(module_name: str) -> bool:
        try:
            importlib.import_module(module_name)
        except ImportError:
            return True
        return False

    missing_python: list[str] = [
        pip_name
        for module_name, pip_name in REQUIRED_PYTHON_MODULES.items()
        if _cannot_import(module_name)
    ]

    warnings: list[str] = []
    missing_system: list[str] = []

    office_binary = shutil.which("libreoffice") or shutil.which("soffice")
    if not office_binary:
        warnings.append("libreoffice not found — legacy .doc files cannot be converted")

    if check_ocr and not shutil.which("tesseract"):
        missing_system.append("tesseract (OCR requested but not installed)")

    return {
        "ok": not (missing_python or missing_system),
        "missing_python": missing_python,
        "missing_system": missing_system,
        "warnings": warnings,
    }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# per-format extractors
|
||||
# --------------------------------------------------------------------------
|
||||
def extract_pdf(path: str | os.PathLike) -> str:
    """
    Extract a PDF into a page-marked body, with no page limit.

    pdfplumber is tried first; if it raises anything, the PyPDF2 extractor is
    used instead and its own exception, if any, propagates.
    """
    pdf_path = str(path)
    try:
        body = _extract_pdf_pdfplumber(pdf_path)
    except Exception:
        body = _extract_pdf_pypdf2(pdf_path)
    return body
|
||||
|
||||
|
||||
def _extract_pdf_pdfplumber(path: str) -> str:
    """
    Extract every page of a PDF with pdfplumber and return a page-marked body.

    A page whose text extraction raises, or yields nothing, becomes an empty
    page so the page numbering stays aligned with the document.
    """
    import pdfplumber

    def _page_text(page) -> str:
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    with pdfplumber.open(path) as pdf:
        # Every page is visited; there is deliberately no page cap.
        texts = [_page_text(page) for page in pdf.pages]
    return join_pages(texts)
|
||||
|
||||
|
||||
def _extract_pdf_pypdf2(path: str) -> str:
    """
    Extract every page of a PDF with PyPDF2 and return a page-marked body.

    A page whose text extraction raises, or yields nothing, becomes an empty
    page so the page numbering stays aligned with the document.
    """
    import PyPDF2

    def _page_text(page) -> str:
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    with open(path, "rb") as stream:
        reader = PyPDF2.PdfReader(stream)
        # Every page is visited; there is deliberately no page cap.
        texts = [_page_text(page) for page in reader.pages]
    return join_pages(texts)
|
||||
|
||||
|
||||
def extract_docx(path: str | os.PathLike) -> str:
    """
    Extract a .docx file into a page-marked body.

    The text of ``document.paragraphs`` is grouped into synthetic pages of
    `DOCX_PARAS_PER_PAGE` paragraphs. A document without paragraphs still
    yields one (empty) page.
    """
    import docx

    document = docx.Document(str(path))
    # NOTE(review): only `document.paragraphs` is read here; text that lives
    # elsewhere in the file (e.g. table cells) is presumably not included.
    texts = [paragraph.text for paragraph in document.paragraphs]
    page_starts = range(0, max(len(texts), 1), DOCX_PARAS_PER_PAGE)
    pages = [
        "\n".join(texts[first:first + DOCX_PARAS_PER_PAGE])
        for first in page_starts
    ]
    return join_pages(pages)
|
||||
|
||||
|
||||
def extract_doc(path: str | os.PathLike) -> str:
    """
    Extract a legacy .doc file by converting it to .docx with LibreOffice.

    The conversion runs ``<soffice> --headless --convert-to docx`` into a
    temporary directory (5 minute timeout) and the result is handed to
    `extract_docx`.

    Raises:
        RuntimeError: neither ``libreoffice`` nor ``soffice`` is on PATH, or
            the conversion produced no ``<stem>.docx`` file.
        subprocess.CalledProcessError / subprocess.TimeoutExpired: propagated
            from the conversion command.
    """
    soffice = shutil.which("libreoffice")
    if not soffice:
        soffice = shutil.which("soffice")
    if not soffice:
        raise RuntimeError("libreoffice/soffice not available — cannot convert .doc")

    source = Path(path).resolve()
    with tempfile.TemporaryDirectory() as workdir:
        command = [
            soffice, "--headless", "--convert-to", "docx",
            "--outdir", workdir, str(source),
        ]
        subprocess.run(command, check=True, capture_output=True, timeout=300)
        produced = Path(workdir) / (source.stem + ".docx")
        if produced.exists():
            # Must be read before the temporary directory is removed.
            return extract_docx(produced)
        raise RuntimeError(f"libreoffice produced no output for {source.name}")
|
||||
|
||||
|
||||
def extract_pptx(path: str | os.PathLike) -> str:
    """
    Extract a .pptx presentation into a page-marked body, one page per slide.

    Each page holds the non-empty text of every shape that has a text frame,
    followed by the speaker notes prefixed with ``[NOTES]`` when the slide has
    any. A slide without text still yields an (empty) page so page numbers
    match slide numbers.
    """
    from pptx import Presentation

    presentation = Presentation(str(path))
    pages: list[str] = []
    for slide in presentation.slides:
        parts: list[str] = []
        for shape in slide.shapes:
            if shape.has_text_frame and shape.text_frame.text.strip():
                parts.append(shape.text_frame.text.strip())
        if slide.has_notes_slide:
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that the same as "no notes".
            notes_frame = slide.notes_slide.notes_text_frame
            notes = notes_frame.text.strip() if notes_frame is not None else ""
            if notes:
                parts.append(f"[NOTES] {notes}")
        pages.append("\n".join(parts))
    return join_pages(pages)
|
||||
|
||||
|
||||
def extract_html(path: str | os.PathLike) -> str:
    """
    Extract the readable text of an HTML page into a single-page body.

    The encoding is guessed with chardet (UTF-8 when it gives no answer) and
    bytes that cannot be decoded are replaced. ``script``, ``style``, ``nav``,
    ``footer``, ``header``, ``aside`` and ``noscript`` elements are removed,
    as are elements whose ``role`` is ``navigation``, ``banner`` or
    ``contentinfo``. The remaining text is returned as stripped, non-empty
    lines wrapped in one ``--- PAGE 1 ---`` page.
    """
    import chardet
    from bs4 import BeautifulSoup

    raw = Path(path).read_bytes()
    enc = chardet.detect(raw).get("encoding") or "utf-8"
    try:
        markup = raw.decode(enc, errors="replace")
    except LookupError:
        # The detector can name an encoding Python has no codec for; fall back
        # to UTF-8 instead of failing the whole file.
        markup = raw.decode("utf-8", errors="replace")
    soup = BeautifulSoup(markup, "lxml")

    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        tag.decompose()
    # also drop common page chrome identified by its ARIA role
    for tag in soup.find_all(attrs={"role": ["navigation", "banner", "contentinfo"]}):
        tag.decompose()

    text = soup.get_text(separator="\n")
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    return join_pages(["\n".join(lines)])
|
||||
|
||||
|
||||
def extract_zip(path: str | os.PathLike) -> str:
    """
    Extract a zip archive into one continuous page-marked body.

    The archive is unpacked into a temporary directory and every inner file is
    visited in sorted path order. Junk files and files of unknown or ignored
    (epub) format are skipped; nested zips are extracted recursively. The
    pages of all inner files are concatenated and renumbered from 1.

    Extraction is best-effort per inner file: any inner file (including a
    nested archive) that fails to extract is skipped so the rest of the
    archive is still processed. An archive that is not a valid zip yields an
    empty string.
    """
    path = str(path)
    pages: list[str] = []
    with tempfile.TemporaryDirectory() as tmp:
        try:
            with zipfile.ZipFile(path) as zf:
                zf.extractall(tmp)
        except zipfile.BadZipFile:
            return ""
        for inner in sorted(Path(tmp).rglob("*")):
            if not inner.is_file() or is_junk(inner):
                continue
            fmt = detect_format(inner)
            if fmt in ("unknown", "epub"):
                continue
            try:
                # Nested archives take the same guarded path as other files.
                body = extract_zip(inner) if fmt == "zip" else extract_file(inner)
            except Exception:
                continue
            pages.extend(text for _, text in split_pages(body))
    return join_pages(pages)
|
||||
|
||||
|
||||
EXTRACTORS: dict[str, Callable[[str | os.PathLike], str]] = {
|
||||
"pdf": extract_pdf,
|
||||
"docx": extract_docx,
|
||||
"doc": extract_doc,
|
||||
"pptx": extract_pptx,
|
||||
"html": extract_html,
|
||||
"zip": extract_zip,
|
||||
}
|
||||
|
||||
|
||||
def extract_file(path: str | os.PathLike) -> str:
    """
    Extract a single file with the extractor that matches its format.

    Plain ``.txt`` files are read as UTF-8 (undecodable bytes replaced); they
    are returned unchanged when they already contain page markers and wrapped
    as a single page otherwise. Every other format is looked up in
    `EXTRACTORS`.

    Raises:
        ValueError: no extractor is registered for the detected format.
    """
    fmt = detect_format(path)
    if fmt != "txt":
        handler = EXTRACTORS.get(fmt)
        if handler is None:
            raise ValueError(f"No extractor for format '{fmt}': {path}")
        return handler(path)

    body = Path(path).read_text(encoding="utf-8", errors="replace")
    if count_page_markers(body):
        # already paginated: pass through untouched
        return body
    return join_pages([body])
|
||||
@@ -1,424 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HTML Activity Extractor - Procesează 1876 fișiere HTML
|
||||
Extrage automat activități folosind pattern recognition
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from pathlib import Path
|
||||
from bs4 import BeautifulSoup
|
||||
import chardet
|
||||
from typing import List, Dict, Optional
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
class HTMLActivityExtractor:
|
||||
def __init__(self, db_path='data/activities.db'):
|
||||
self.db_path = db_path
|
||||
# Pattern-uri pentru detectare activiti <20>n rom<6F>n
|
||||
self.activity_patterns = {
|
||||
'title_patterns': [
|
||||
r'(?i)(joc|activitate|exerci[t]iu|team[\s-]?building|energizer|ice[\s-]?breaker)[\s:]+([^\.]{5,100})',
|
||||
r'(?i)<h[1-6][^>]*>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</h[1-6]>',
|
||||
r'(?i)<strong>([^<]*(?:joc|activitate|exerci[t]iu)[^<]*)</strong>',
|
||||
r'(?i)^[\d]+\.?\s*([A-Z][^\.]{10,100}(?:joc|activitate|exerci[t]iu)[^\.]{0,50})$',
|
||||
],
|
||||
'description_markers': [
|
||||
'descriere', 'reguli', 'cum se joac[a]', 'instructiuni',
|
||||
'obiectiv', 'desfasurare', 'explicatie', 'mod de joc'
|
||||
],
|
||||
'materials_markers': [
|
||||
'materiale', 'necesare', 'echipament', 'ce avem nevoie',
|
||||
'se folosesc', 'trebuie sa avem', 'dotari'
|
||||
],
|
||||
'age_patterns': [
|
||||
r'(?i)v[<5B>a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
|
||||
r'(?i)(\d+)[\s-]+(\d+)\s*ani',
|
||||
r'(?i)pentru\s+(\d+)[\s-]+(\d+)\s*ani',
|
||||
r'(?i)categoria?\s*(?:de\s*)?v[<5B>a]rst[a][\s:]+(\d+)[\s-]+(\d+)',
|
||||
],
|
||||
'participants_patterns': [
|
||||
r'(?i)(\d+)[\s-]+(\d+)\s*(?:participan[t]i|juc[a]tori|persoane|copii)',
|
||||
r'(?i)num[a]r\s*(?:de\s*)?(?:participan[t]i|juc[a]tori)[\s:]+(\d+)[\s-]+(\d+)',
|
||||
r'(?i)grup\s*de\s*(\d+)[\s-]+(\d+)',
|
||||
],
|
||||
'duration_patterns': [
|
||||
r'(?i)durat[a][\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
|
||||
r'(?i)timp[\s:]+(\d+)[\s-]+(\d+)\s*(?:minute|min)',
|
||||
r'(?i)(\d+)[\s-]+(\d+)\s*minute',
|
||||
]
|
||||
}
|
||||
|
||||
# Categorii predefinite bazate pe sistemul existent
|
||||
self.categories = {
|
||||
'[A]': ['joc', 'joaca', 'distractie', 'amuzament'],
|
||||
'[B]': ['aventura', 'explorare', 'descoperire'],
|
||||
'[C]': ['camping', 'tabara', 'excursie', 'drumetie'],
|
||||
'[D]': ['foc', 'flacara', 'lumina'],
|
||||
'[E]': ['noduri', 'fr<EFBFBD>nghii', 'sfori', 'legare'],
|
||||
'[F]': ['bushcraft', 'supravietuire', 'survival'],
|
||||
'[G]': ['educatie', 'educativ', 'invatare', 'scoala'],
|
||||
'[H]': ['orientare', 'busola', 'harta', 'navigare']
|
||||
}
|
||||
|
||||
def detect_encoding(self, file_path):
|
||||
"""Detecteaz encoding-ul fiierului"""
|
||||
with open(file_path, 'rb') as f:
|
||||
result = chardet.detect(f.read())
|
||||
return result['encoding'] or 'utf-8'
|
||||
|
||||
def extract_from_html(self, html_path: str) -> List[Dict]:
|
||||
"""Extrage activiti dintr-un singur fiier HTML"""
|
||||
activities = []
|
||||
|
||||
try:
|
||||
# Detectare encoding i citire
|
||||
encoding = self.detect_encoding(html_path)
|
||||
with open(html_path, 'r', encoding=encoding, errors='ignore') as f:
|
||||
content = f.read()
|
||||
|
||||
soup = BeautifulSoup(content, 'lxml')
|
||||
|
||||
# Metod 1: Caut liste de activiti
|
||||
activities.extend(self._extract_from_lists(soup, html_path))
|
||||
|
||||
# Metod 2: Caut activiti <20>n headings
|
||||
activities.extend(self._extract_from_headings(soup, html_path))
|
||||
|
||||
# Metod 3: Caut pattern-uri <20>n text
|
||||
activities.extend(self._extract_from_patterns(soup, html_path))
|
||||
|
||||
# Metod 4: Caut <20>n tabele
|
||||
activities.extend(self._extract_from_tables(soup, html_path))
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing {html_path}: {e}")
|
||||
|
||||
return activities
|
||||
|
||||
def _extract_from_lists(self, soup, source_file):
    """
    Extract activities from HTML lists (``ul`` / ``ol``).

    A list is considered only when its lower-cased text contains 'joc',
    'activitate' or 'exercitiu'. Each ``li`` longer than 20 characters is
    turned into an activity via `_create_activity_from_text`.
    """
    activities = []
    markers = ['joc', 'activitate', 'exercitiu']

    for list_elem in soup.find_all(['ul', 'ol']):
        # Skip lists that do not look like they contain activities
        list_text = list_elem.get_text().lower()
        if not any(marker in list_text for marker in markers):
            continue
        for item in list_elem.find_all('li'):
            text = item.get_text(strip=True)
            if len(text) <= 20:  # an activity needs more than 20 characters
                continue
            activity = self._create_activity_from_text(text, source_file)
            if activity:
                activities.append(activity)

    return activities
|
||||
|
||||
def _extract_from_headings(self, soup, source_file):
    """
    Extract activities whose name is an HTML heading.

    A heading qualifies when its lower-cased text contains 'joc',
    'activitate' or 'exercitiu'. The description is gathered from the
    following sibling ``p`` / ``div`` / ``ul`` elements until the next heading
    or until more than 500 characters were collected. Headings without any
    description are skipped.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    activities = []

    for heading in soup.find_all(heading_tags):
        heading_text = heading.get_text(strip=True)

        # Only headings that mention one of the keywords
        if not any(keyword in heading_text.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
            continue

        # Collect the description from the elements that follow
        description = ""
        sibling = heading.find_next_sibling()
        while sibling and sibling.name not in heading_tags:
            if sibling.name in ['p', 'div', 'ul']:
                description += sibling.get_text(strip=True) + " "
                if len(description) > 500:  # description length cap
                    break
            sibling = sibling.find_next_sibling()

        if not description:
            continue

        activities.append({
            'name': heading_text[:200],
            'description': description[:1000],
            'source_file': str(source_file),
            'category': self._detect_category(heading_text + " " + description)
        })

    return activities
|
||||
|
||||
def _extract_from_patterns(self, soup, source_file):
    """
    Extract activities by running the title regexes over the page text.

    For every match of a pattern in ``activity_patterns['title_patterns']``
    the title is the last capturing group that matched, or the whole match
    when no group took part. Titles of 10 characters or fewer are ignored.
    The activity is built from a context window of 200 characters before and
    500 characters after the match.
    """
    activities = []
    text = soup.get_text()

    for pattern in self.activity_patterns['title_patterns']:
        for match in re.finditer(pattern, text, re.MULTILINE):
            # lastindex is None (never 0) when no group matched, so fall back
            # to group 0 explicitly.
            title = match.group(match.lastindex or 0)
            if len(title) <= 10:
                continue

            # Context window around the match
            start = max(0, match.start() - 200)
            end = min(len(text), match.end() + 500)
            context = text[start:end]

            activity = self._create_activity_from_text(context, source_file, title)
            if activity:
                activities.append(activity)

    return activities
|
||||
|
||||
def _extract_from_tables(self, soup, source_file):
    """
    Extract activities from HTML tables.

    The first row of a table supplies lower-cased column names; each later
    row is mapped column-by-column onto those names (extra cells are
    dropped). A row becomes an activity when it has a 'joc', 'activitate',
    'nume' or 'titlu' column and `_create_activity_from_table_data` accepts it.
    """
    activities = []

    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        if len(rows) <= 1:  # need a header row plus at least one data row
            continue

        # Column names come from the first row
        headers = [cell.get_text(strip=True).lower() for cell in rows[0].find_all(['th', 'td'])]

        for row in rows[1:]:
            cells = row.find_all(['td'])
            if not cells:
                continue

            # zip stops at the shorter side, so surplus cells are ignored
            activity_data = {}
            for header, cell in zip(headers, cells):
                activity_data[header] = cell.get_text(strip=True)

            if not any(key in activity_data for key in ['joc', 'activitate', 'nume', 'titlu']):
                continue
            activity = self._create_activity_from_table_data(activity_data, source_file)
            if activity:
                activities.append(activity)

    return activities
|
||||
|
||||
def _create_activity_from_text(self, text, source_file, title=None):
|
||||
"""Creeaz un dicionar de activitate din text"""
|
||||
if not text or len(text) < 30:
|
||||
return None
|
||||
|
||||
activity = {
|
||||
'name': title or text[:100].split('.')[0].strip(),
|
||||
'description': text[:1000],
|
||||
'source_file': str(source_file),
|
||||
'category': self._detect_category(text),
|
||||
'keywords': self._extract_keywords(text),
|
||||
'created_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Extrage metadata suplimentar
|
||||
activity.update(self._extract_metadata(text))
|
||||
|
||||
return activity
|
||||
|
||||
def _create_activity_from_table_data(self, data, source_file):
|
||||
"""Creeaz activitate din date de tabel"""
|
||||
activity = {
|
||||
'source_file': str(source_file),
|
||||
'created_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Mapare c<>mpuri tabel la c<>mpuri DB
|
||||
field_mapping = {
|
||||
'nume': 'name', 'titlu': 'name', 'joc': 'name', 'activitate': 'name',
|
||||
'descriere': 'description', 'detalii': 'description', 'explicatie': 'description',
|
||||
'materiale': 'materials_list', 'echipament': 'materials_list',
|
||||
'varsta': 'age_group_min', 'categoria': 'category',
|
||||
'participanti': 'participants_min', 'numar': 'participants_min',
|
||||
'durata': 'duration_min', 'timp': 'duration_min'
|
||||
}
|
||||
|
||||
for table_field, db_field in field_mapping.items():
|
||||
if table_field in data:
|
||||
activity[db_field] = data[table_field]
|
||||
|
||||
# Validare minim
|
||||
if 'name' in activity and len(activity.get('name', '')) > 5:
|
||||
return activity
|
||||
|
||||
return None
|
||||
|
||||
def _extract_metadata(self, text):
    """Extract age, participant, duration and materials metadata from text.

    Reads the regex lists and marker list in ``self.activity_patterns``
    (keys ``age_patterns``, ``participants_patterns``, ``duration_patterns``
    and ``materials_markers``). For each range kind the first pattern that
    matches wins; group 1 is the lower bound and group 2, when present and
    matched, the upper bound (otherwise the upper bound equals the lower).

    Returns a dict that may contain ``age_group_min/max``,
    ``participants_min/max``, ``duration_min/max`` and ``materials_list``
    (at most 10 items joined with ', ').
    """
    metadata = {}

    def first_range(pattern_key):
        """Return (low, high) from the first matching pattern, or None."""
        for pattern in self.activity_patterns[pattern_key]:
            match = re.search(pattern, text)
            if not match:
                continue
            low = int(match.group(1))
            # An optional second group may exist without having matched;
            # int(None) would raise, so fall back to the lower bound.
            has_upper = match.lastindex >= 2 and match.group(2) is not None
            return low, (int(match.group(2)) if has_upper else low)
        return None

    range_fields = (
        ('age_group', 'age_patterns'),
        ('participants', 'participants_patterns'),
        ('duration', 'duration_patterns'),
    )
    for field, pattern_key in range_fields:
        bounds = first_range(pattern_key)
        if bounds is not None:
            metadata[f'{field}_min'], metadata[f'{field}_max'] = bounds

    # Materials: look at the 200 characters following each marker and pull
    # out list items introduced by '-' or '"'.
    # The hyphen inside the negated class is escaped on purpose: unescaped,
    # `\n-"` is a character *range* that also excludes spaces, which cut
    # every item at its first space.
    item_pattern = re.compile(r'[-"]\s*([^\n\-"]+)')
    materials = []
    text_lower = text.lower()
    for marker in self.activity_patterns['materials_markers']:
        idx = text_lower.find(marker)
        if idx == -1:
            continue
        window = text[idx:idx + 200]
        for item in item_pattern.findall(window):
            item = item.strip()
            if item:
                materials.append(item)

    if materials:
        metadata['materials_list'] = ', '.join(materials[:10])  # at most 10 materials

    return metadata
|
||||
|
||||
def _detect_category(self, text):
    """Return the first category whose keyword occurs in ``text``.

    ``self.categories`` maps a category code to an iterable of keywords;
    matching is a case-insensitive substring test in mapping order. When
    nothing matches, the default games category '[A]' is returned.
    """
    haystack = text.lower()
    matching = (
        code
        for code, words in self.categories.items()
        if any(word in haystack for word in words)
    )
    return next(matching, '[A]')  # Default categoria jocuri -> '[A]'
|
||||
|
||||
def _extract_keywords(self, text):
    """Return up to five known keywords found in ``text``, joined by ', '.

    Matching is a case-insensitive substring test against a fixed
    vocabulary; results keep the vocabulary order.
    """
    haystack = text.lower()

    # Fixed vocabulary of relevant keywords.
    vocabulary = (
        'cooperare', 'competitie', 'echipa', 'creativitate', 'miscare',
        'strategie', 'comunicare', 'incredere', 'coordonare', 'atentie',
        'reflexe', 'logica', 'imaginatie', 'muzica', 'dans', 'sport',
        'natura', 'mediu', 'stiinta', 'matematica', 'limba', 'cultura',
    )

    hits = [word for word in vocabulary if word in haystack]
    return ', '.join(hits[:5])  # at most 5 keywords
|
||||
|
||||
def save_to_database(self, activities):
    """Insert activities into the ``activities`` table, skipping duplicates.

    An activity is a duplicate when a row with the same ``name`` and
    ``source_file`` already exists. ``created_at`` is never inserted (the
    column has a default). A row that fails to insert is reported on stdout
    and skipped; the remaining rows are still processed.

    Returns ``(saved_count, duplicate_count)``.

    The connection is closed in a ``finally`` block so it is released even
    if something outside the per-row handler raises.
    """
    saved_count = 0
    duplicate_count = 0

    conn = sqlite3.connect(self.db_path)
    try:
        cursor = conn.cursor()

        for activity in activities:
            try:
                # Duplicate check on (name, source_file).
                cursor.execute(
                    "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                    (activity.get('name'), activity.get('source_file'))
                )
                if cursor.fetchone():
                    duplicate_count += 1
                    continue

                # Values are bound as parameters; column names come from
                # the activity keys produced by this extractor.
                row = {key: value for key, value in activity.items() if key != 'created_at'}
                columns = ', '.join(row)
                placeholders = ', '.join('?' for _ in row)

                query = f"INSERT INTO activities ({columns}) VALUES ({placeholders})"
                cursor.execute(query, list(row.values()))
                saved_count += 1

            except Exception as e:  # best effort: one bad row must not stop the batch
                print(f"Error saving activity: {e}")
                continue

        conn.commit()
    finally:
        conn.close()

    return saved_count, duplicate_count
|
||||
|
||||
def process_all_html_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
    """Extract and store activities from every HTML file under ``base_path``.

    Walks ``base_path`` recursively for ``*.html`` and ``*.htm`` files,
    runs ``self.extract_from_html`` on each and buffers the results. After
    every 100th file the buffer is flushed through
    ``self.save_to_database``; whatever is left is flushed at the end. A
    file that raises is reported and counted as an error.

    Returns ``(processed, errors)``.
    """
    root = Path(base_path)
    html_files = [*root.rglob("*.html"), *root.rglob("*.htm")]
    total = len(html_files)

    print(f"Found {total} HTML files to process")

    pending = []
    processed = 0
    errors = 0

    for position, html_file in enumerate(html_files, start=1):
        try:
            pending.extend(self.extract_from_html(str(html_file)))
            processed += 1

            # Progress report and batch flush every 100 files.
            if position % 100 != 0:
                continue
            print(f"Progress: {position}/{len(html_files)} files processed, {len(pending)} activities found")
            if pending:
                saved, dupes = self.save_to_database(pending)
                print(f"Batch saved: {saved} new activities, {dupes} duplicates skipped")
                pending = []  # clear the buffer

        except Exception as e:
            print(f"Error processing {html_file}: {e}")
            errors += 1

    # Flush whatever is still buffered.
    if pending:
        saved, dupes = self.save_to_database(pending)
        print(f"Final batch saved: {saved} new activities, {dupes} duplicates skipped")

    print("\nProcessing complete!")
    print(f"Files processed: {processed}")
    print(f"Errors: {errors}")

    return processed, errors
|
||||
|
||||
# Manual test entry point.
if __name__ == "__main__":
    extractor = HTMLActivityExtractor()

    # Try a few sample files first.
    print("Testing on sample file first...")
    corpus_root = Path('/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri')
    samples = list(corpus_root.rglob("*.html"))[:3]

    for sample in samples:
        print(f"\nTesting: {sample}")
        found = extractor.extract_from_html(str(sample))
        print(f"Found {len(found)} activities")
        if found:
            print(f"Sample activity: {found[0]['name'][:50]}...")

    # Ask before running the full (long) processing pass.
    answer = input("\nContinue with full processing? (y/n): ")
    if answer.lower() == 'y':
        extractor.process_all_html_files()
|
||||
@@ -1,78 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import activities extracted by Claude from JSON files
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
class ClaudeActivityImporter:
    """Import activities from JSON extraction files into the SQLite DB."""

    def __init__(self, db_path='data/activities.db'):
        """Remember the DB path and make sure the JSON drop directory exists."""
        self.db_path = db_path
        self.json_dir = Path('scripts/extracted_activities')
        # parents=True so this also works when scripts/ is absent from cwd.
        self.json_dir.mkdir(parents=True, exist_ok=True)

    def import_json_file(self, json_path):
        """Import activities from a single JSON file.

        The file is expected to hold an object with an optional
        ``source_file`` string and an ``activities`` list of objects whose
        keys are column names. Each activity gets ``source_file`` and
        ``created_at`` added, and is inserted unless a row with the same
        (name, source_file) already exists.

        ``json_path`` may be a ``str`` or a ``Path``. Returns the number of
        rows inserted. Rows that fail (bad column name, SQL error) are
        reported on stdout and skipped.
        """
        json_path = Path(json_path)
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        source_file = data.get('source_file', str(json_path))
        activities = data.get('activities', [])

        imported = 0
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            for activity in activities:
                try:
                    # Work on a copy so the parsed JSON is left untouched.
                    row = dict(activity)
                    row['source_file'] = source_file
                    row['created_at'] = datetime.now().isoformat()

                    # Keys come from an external file and end up in the SQL
                    # text, so only plain identifiers are accepted.
                    invalid = [k for k in row if not (isinstance(k, str) and k.isidentifier())]
                    if invalid:
                        raise ValueError(f"invalid column name(s): {invalid}")

                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (row.get('name'), source_file)
                    )
                    if cursor.fetchone():
                        continue

                    columns = ', '.join(f'"{k}"' for k in row)
                    placeholders = ', '.join('?' for _ in row)
                    query = f"INSERT INTO activities ({columns}) VALUES ({placeholders})"
                    cursor.execute(query, list(row.values()))
                    imported += 1

                except Exception as e:  # best effort: keep importing the rest
                    print(f"Error importing activity: {e}")

            conn.commit()
        finally:
            conn.close()

        print(f"Imported {imported} activities from {json_path.name}")
        return imported

    def import_all_json_files(self):
        """Import every ``*.json`` in ``self.json_dir``; return the total inserted."""
        json_files = sorted(self.json_dir.glob("*.json"))

        if not json_files:
            print("No JSON files found in extracted_activities directory")
            return 0

        total_imported = sum(self.import_json_file(json_file) for json_file in json_files)

        print(f"\nTotal imported: {total_imported} activities from {len(json_files)} files")
        return total_imported


if __name__ == "__main__":
    importer = ClaudeActivityImporter()
    importer.import_all_json_files()
|
||||
179
scripts/import_common.py
Normal file
179
scripts/import_common.py
Normal file
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
import_common.py — shared helpers for the import / validation side of the
|
||||
extraction pipeline (Lane C).
|
||||
|
||||
Used by build_database.py and validate_extractions.py:
|
||||
* JSON-schema validation of subagent extraction files,
|
||||
* the anti-hallucination source_excerpt substring check (E5),
|
||||
* locating the source chunk that an extraction file came from,
|
||||
* the stable content key used by the needs_review queue.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent
|
||||
|
||||
DEFAULT_SCHEMA_PATH = SCRIPT_DIR / "activity_schema.json"
|
||||
|
||||
# rapidfuzz.partial_ratio is on a 0..100 scale; an excerpt counts as a real
|
||||
# quote from the source when it scores at least this against the chunk text.
|
||||
EXCERPT_MATCH_THRESHOLD = 90.0
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# schema validation
|
||||
# --------------------------------------------------------------------------
|
||||
def load_schema(schema_path: str | Path = DEFAULT_SCHEMA_PATH) -> dict:
    """Read the activity JSON schema (produced by Lane A) and parse it.

    The file at `schema_path` is read as UTF-8 text and decoded with
    `json.loads`; the parsed value is returned as-is.
    """
    raw_schema = Path(schema_path).read_text(encoding="utf-8")
    return json.loads(raw_schema)
|
||||
|
||||
|
||||
def validate_extraction(data: Any, schema: dict) -> list[str]:
    """
    Check one parsed extraction file against `schema` (Draft 7 validator).

    Each problem becomes one string of the form `<location>: <message>`,
    where the location is the '/'-joined path to the offending value or
    `<root>` for top-level problems. Problems are ordered by path. An empty
    list means the data is valid.
    """
    import jsonschema

    def describe(err) -> str:
        location = "/".join(str(part) for part in err.path)
        if not location:
            location = "<root>"
        return f"{location}: {err.message}"

    problems = list(jsonschema.Draft7Validator(schema).iter_errors(data))
    problems.sort(key=lambda e: list(e.path))
    return [describe(err) for err in problems]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# excerpt verification (E5 — anti-hallucination)
|
||||
# --------------------------------------------------------------------------
|
||||
def _normalize_text(text: str) -> str:
    """Collapse whitespace runs to one space, trim, and lowercase.

    A falsy `text` (None, "") yields "".
    """
    raw = text if text else ""
    single_spaced = re.sub(r"\s+", " ", raw)
    return single_spaced.strip().lower()
|
||||
|
||||
|
||||
def excerpt_score(excerpt: str, chunk_text: str) -> float:
    """Best fuzzy-substring score (0..100) of `excerpt` inside `chunk_text`.

    Both strings are normalized with `_normalize_text` first. If either is
    empty *after* normalization (None, "" or whitespace only), the score is
    0.0, so a blank excerpt can never count as a quote and the result does
    not depend on how rapidfuzz scores empty strings. Otherwise the value is
    `rapidfuzz.fuzz.partial_ratio` of the normalized strings.
    """
    needle = _normalize_text(excerpt)
    haystack = _normalize_text(chunk_text)
    if not needle or not haystack:
        return 0.0

    # Imported after the guard so trivial inputs do not need rapidfuzz.
    from rapidfuzz import fuzz

    return float(fuzz.partial_ratio(needle, haystack))
|
||||
|
||||
|
||||
def excerpt_matches(
    excerpt: str, chunk_text: str, threshold: float = EXCERPT_MATCH_THRESHOLD
) -> bool:
    """Report whether `excerpt` occurs (fuzzily) inside `chunk_text`.

    True when `excerpt_score(excerpt, chunk_text)` reaches `threshold`.
    """
    score = excerpt_score(excerpt, chunk_text)
    return score >= threshold
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# locating the source chunk an extraction file came from
|
||||
# --------------------------------------------------------------------------
|
||||
def chunk_key_for(json_path: Path, header: Optional[dict]) -> str:
    """
    Work out which chunk an extraction file belongs to.

    A truthy `chunk_key` entry in `header` wins (converted to `str`).
    Without one, the key is the stem of `json_path`, since extraction
    files are named `<chunk_key>.json`.
    """
    explicit = header.get("chunk_key") if header else None
    return str(explicit) if explicit else json_path.stem
|
||||
|
||||
|
||||
def source_id_for(chunk_key: str, header: Optional[dict]) -> str:
    """Work out the source id for a chunk.

    A truthy `source_id` entry in `header` wins (converted to `str`).
    Otherwise the id is `chunk_key` with its last `.partNN` suffix removed;
    a key without `.part` is returned unchanged.
    """
    explicit = header.get("source_id") if header else None
    if explicit:
        return str(explicit)
    # Chunk keys look like "<source_id>.partNN": cut at the last ".part".
    head, separator, _ = chunk_key.rpartition(".part")
    return head if separator else chunk_key
|
||||
|
||||
|
||||
def find_chunk_text(
    json_path: Path, header: Optional[dict], chunks_dir: Path
) -> Optional[str]:
    """
    Load the text of the chunk that an extraction file was produced from.

    The expected location `<chunks_dir>/<source_id>/<chunk_key>.txt` is
    tried first. If it is not a file, `chunks_dir` is searched recursively
    for `<chunk_key>.txt` and the first hit is used. Files are decoded as
    UTF-8 with undecodable bytes replaced. Returns None when nothing is
    found.
    """
    chunk_key = chunk_key_for(json_path, header)
    filename = f"{chunk_key}.txt"

    expected = chunks_dir / source_id_for(chunk_key, header) / filename
    if expected.is_file():
        return expected.read_text(encoding="utf-8", errors="replace")

    found = list(chunks_dir.rglob(filename))
    if not found:
        return None
    return found[0].read_text(encoding="utf-8", errors="replace")
|
||||
|
||||
|
||||
def source_path_for(source_id: str, sources_dir: Path) -> Optional[str]:
    """
    Look up the original path recorded in a normalized source file.

    `<sources_dir>/<source_id>.txt` is scanned line by line for a line
    starting with `SOURCE:`; the text after the first colon is returned,
    stripped. Scanning stops at the first line starting with `=` (header
    rule) or `--- PAGE `. Returns None if the file is missing or
    unreadable, or if no `SOURCE:` line precedes the end of the header.
    """
    src_file = sources_dir / f"{source_id}.txt"
    if src_file.is_file():
        try:
            with src_file.open(encoding="utf-8", errors="replace") as fh:
                for line in fh:
                    if line.startswith("SOURCE:"):
                        _, _, value = line.partition(":")
                        return value.strip()
                    if line.startswith(("=", "--- PAGE ")):
                        # End of header reached without a SOURCE line.
                        return None
        except OSError:
            pass
    return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# stable content key for the needs_review queue (plan §5c)
|
||||
# --------------------------------------------------------------------------
|
||||
def normalize_name(name: str) -> str:
    """Return the dedup form of a name: no diacritics, lowercase, single spaces.

    The name is NFKD-decomposed, combining marks are dropped, the result is
    lowercased and trimmed, and whitespace runs collapse to one space. A
    falsy name yields "".
    """
    if not name:
        return ""
    base_chars = [
        ch
        for ch in unicodedata.normalize("NFKD", name)
        if unicodedata.combining(ch) == 0
    ]
    lowered = "".join(base_chars).lower().strip()
    return re.sub(r"\s+", " ", lowered)
|
||||
|
||||
|
||||
def content_key(normalized_name: str, language: Optional[str], description: str) -> str:
    """
    Compute the stable review-queue identifier for a row.

    The key is the SHA-1 hex digest of `normalized_name`, `language` (empty
    when falsy) and the whitespace/case-normalized `description`, joined
    with the unit separator `\\x1f`.

    Only borderline-kept-separate rows and legacy `.doc` rows ever carry
    needs_review, and neither is auto-merged, so this triple stays the same
    across rebuilds.
    """
    fields = (
        str(normalized_name),
        str(language or ""),
        _normalize_text(description),
    )
    payload = "\x1f".join(fields)
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# iteration
|
||||
# --------------------------------------------------------------------------
|
||||
def iter_extraction_files(extracted_dir: Path):
    """Yield, in sorted order, each regular `*.json` file directly inside `extracted_dir`.

    Subdirectories (such as `_rejected/`) are not descended into. Nothing is
    yielded when `extracted_dir` is not a directory.
    """
    if extracted_dir.is_dir():
        candidates = sorted(extracted_dir.glob("*.json"))
        yield from (candidate for candidate in candidates if candidate.is_file())
|
||||
255
scripts/normalize_sources.py
Normal file
255
scripts/normalize_sources.py
Normal file
@@ -0,0 +1,255 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
normalize_sources.py — walk data/carti-camp-jocuri/ and write data/sources/<id>.txt.
|
||||
|
||||
Output files keep the existing header format:
|
||||
|
||||
SOURCE: <original relative path>
|
||||
CONVERTED: <iso date>
|
||||
FORMAT: <pdf|docx|doc|pptx|html-mirror|zip>
|
||||
NEEDS_REVIEW: <reason> (optional — legacy .doc conversions)
|
||||
==================================================
|
||||
|
||||
--- PAGE 1 ---
|
||||
...
|
||||
|
||||
Each source gets a stable id = <8-hex hash of relative path>_<sanitized stem>,
|
||||
so two files with the same name in different folders never collide.
|
||||
|
||||
The pipeline is script-only: this normalizes formats, it does NOT run extraction.
|
||||
Run `--check-deps` before a long job.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as _dt
|
||||
import hashlib
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
if str(SCRIPT_DIR) not in sys.path:
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from extract_common import ( # noqa: E402
|
||||
count_page_markers,
|
||||
dedupe_texts,
|
||||
detect_format,
|
||||
extract_file,
|
||||
extract_html,
|
||||
is_junk,
|
||||
join_pages,
|
||||
preflight,
|
||||
split_pages,
|
||||
)
|
||||
|
||||
HEADER_RULE = "=" * 50
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# stable source id
|
||||
# --------------------------------------------------------------------------
|
||||
def sanitize_stem(stem: str) -> str:
    """Turn a file stem into a lowercase, underscore-separated slug.

    Runs of non-word characters become a single `_`, leading and trailing
    underscores are trimmed, and the result is cut to 60 characters. An
    empty result is replaced by "source".
    """
    underscored = re.sub(r"[^\w]+", "_", stem, flags=re.UNICODE)
    slug = underscored.strip("_").lower()[:60]
    return slug if slug else "source"
|
||||
|
||||
|
||||
def stable_id(relative_path: str | Path) -> str:
    """Derive a source id from a path relative to the corpus root.

    The id is `<first 8 hex chars of SHA-1 of the path>_<sanitized stem>`.
    Backslashes are turned into `/` before hashing.
    """
    posix_rel = str(relative_path).replace("\\", "/")
    prefix = hashlib.sha1(posix_rel.encode("utf-8")).hexdigest()[:8]
    suffix = sanitize_stem(Path(posix_rel).stem)
    return "_".join((prefix, suffix))
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# header
|
||||
# --------------------------------------------------------------------------
|
||||
def build_header(
    source_rel: str, fmt: str, needs_review: str | None = None
) -> str:
    """Compose the text header written at the top of a normalized source.

    Lines, in order: `SOURCE:`, `CONVERTED:` (today's ISO date), `FORMAT:`,
    an optional `NEEDS_REVIEW:` line when `needs_review` is truthy, then the
    header rule. The result ends with a blank line.
    """
    header_lines = [
        f"SOURCE: {source_rel}",
        f"CONVERTED: {_dt.date.today().isoformat()}",
        f"FORMAT: {fmt}",
    ]
    if needs_review:
        header_lines += [f"NEEDS_REVIEW: {needs_review}"]
    header_lines += [HEADER_RULE]
    return "\n".join(header_lines) + "\n\n"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# mirror-site directories
|
||||
# --------------------------------------------------------------------------
|
||||
MIRROR_PAGE_EXTS = {".html", ".htm"}


def is_mirror_dir(path: Path) -> bool:
    """Tell whether `path` is a site-mirror directory.

    True when `path` is a directory whose name does not end in `_files`
    and which contains, at any depth, at least one regular file with an
    HTML extension (case-insensitive).
    """
    if not path.is_dir() or path.name.endswith("_files"):
        return False
    for candidate in path.rglob("*"):
        if candidate.is_file() and candidate.suffix.lower() in MIRROR_PAGE_EXTS:
            return True
    return False
|
||||
|
||||
|
||||
def normalize_mirror(mirror_dir: Path) -> str:
    """Extract every HTML page in a mirror dir, dedupe near-duplicates, join.

    Pages are visited in sorted path order. Anything located under a
    directory whose name ends in `_files` (browser "save as" asset folders,
    the same rule `is_mirror_dir` applies) is skipped. Only components
    *below* `mirror_dir` are inspected, so the location of the corpus itself
    cannot exclude pages. A page whose extraction raises is skipped with a
    warning on stderr; pages with no text are dropped.
    """
    pages: list[tuple[str, str]] = []
    for html in sorted(mirror_dir.rglob("*")):
        if not html.is_file() or html.suffix.lower() not in MIRROR_PAGE_EXTS:
            continue
        rel = html.relative_to(mirror_dir)
        # rel.parts[:-1] are the directories between the mirror root and the page.
        if any(part.endswith("_files") for part in rel.parts[:-1]):
            continue
        try:
            body = extract_html(html)
        except Exception as exc:  # noqa: BLE001 - best effort per page
            print(f"WARNING: skipped {rel}: {exc}", file=sys.stderr)
            continue
        text = "\n".join(t for _, t in split_pages(body))
        if text.strip():
            pages.append((str(rel), text))
    pages = dedupe_texts(pages)
    return join_pages([t for _, t in pages])
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# one source
|
||||
# --------------------------------------------------------------------------
|
||||
def normalize_one(
    path: Path, corpus_root: Path, out_dir: Path
) -> dict | None:
    """
    Convert one corpus entry (file or mirror directory) to `<out_dir>/<id>.txt`.

    Returns None for skipped entries: non-mirror directories, junk files,
    and files whose detected format is unknown, epub or txt. Otherwise
    returns a result dict with `id`, `source` and `status`:
    `"error"` (file extraction raised; carries `error`), `"empty"` (no text
    was produced) or `"ok"` (file written; carries `format`, `pages` and
    `needs_review`).
    """
    rel = path.relative_to(corpus_root)
    sid = stable_id(rel)
    source = str(rel)

    if not path.is_dir():
        if is_junk(path):
            return None
        fmt = detect_format(path)
        if fmt in ("unknown", "epub", "txt"):
            return None  # epub duplicates PDFs; txt is not a source format here
        needs_review = None
        if fmt == "doc":
            needs_review = "legacy .doc conversion is imperfect"
        try:
            body = extract_file(path)
        except Exception as exc:  # noqa: BLE001
            return {"id": sid, "source": source, "status": "error", "error": str(exc)}
    else:
        if not is_mirror_dir(path):
            return None
        fmt = "html-mirror"
        needs_review = None
        body = normalize_mirror(path)

    if not body.strip():
        return {"id": sid, "source": source, "status": "empty"}

    document = build_header(source, fmt, needs_review) + body
    (out_dir / f"{sid}.txt").write_text(document, encoding="utf-8")

    return {
        "id": sid,
        "source": source,
        "status": "ok",
        "format": fmt,
        "pages": count_page_markers(body),
        "needs_review": bool(needs_review),
    }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# walk
|
||||
# --------------------------------------------------------------------------
|
||||
def iter_corpus_entries(corpus_root: Path):
    """Yield the direct children of `corpus_root` that should be normalized.

    Entries are visited in sorted order. Names starting with "." are
    skipped. Non-directories are always yielded; directories only when
    `is_mirror_dir` accepts them.

    NOTE(review): directories that are not mirrors are dropped without
    descending into them, so documents stored in plain subfolders are never
    yielded. Confirm this matches the corpus layout.
    """
    for entry in sorted(corpus_root.iterdir()):
        hidden = entry.name.startswith(".")
        if hidden:
            continue
        if not entry.is_dir() or is_mirror_dir(entry):
            yield entry
|
||||
|
||||
|
||||
def run(corpus_root: Path, out_dir: Path) -> dict:
    """Normalize every corpus entry and summarize the outcome.

    Creates `out_dir` if needed, runs `normalize_one` on each entry from
    `iter_corpus_entries`, and returns a dict with the counts `total`, `ok`,
    `errors`, `empty` and `needs_review`, plus the per-entry `results` list
    (skipped entries are not included).
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    outcomes = (
        normalize_one(entry, corpus_root, out_dir)
        for entry in iter_corpus_entries(corpus_root)
    )
    results: list[dict] = [res for res in outcomes if res is not None]

    def with_status(status: str) -> int:
        return sum(1 for r in results if r["status"] == status)

    return {
        "total": len(results),
        "ok": with_status("ok"),
        "errors": with_status("error"),
        "empty": with_status("empty"),
        "needs_review": sum(1 for r in results if r.get("needs_review")),
        "results": results,
    }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI
|
||||
# --------------------------------------------------------------------------
|
||||
def print_preflight(report: dict) -> int:
    """Print a dependency preflight report and return a process exit code.

    Reads `missing_python`, `missing_system`, `warnings` and `ok` from
    `report`. Returns 0 when `report["ok"]` is truthy, else 1.
    """
    missing_python = report["missing_python"]
    missing_system = report["missing_system"]
    ready = report["ok"]

    print("Dependency preflight")
    print("--------------------")
    if not missing_python:
        print(" Python packages: OK")
    else:
        print(" MISSING Python packages: " + ", ".join(missing_python))
    if missing_system:
        print(" MISSING system tools : " + ", ".join(missing_system))
    for warning in report["warnings"]:
        print(f" WARNING: {warning}")
    verdict = "READY" if ready else "NOT READY — install the above"
    print(" => " + verdict)
    if ready:
        return 0
    return 1
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point; returns the process exit code.

    With `--check-deps`, only the dependency preflight is printed and its
    result returned. Otherwise the run aborts with 1 when Python packages
    are missing, prints any preflight warnings, normalizes the corpus and
    prints a summary followed by one line per non-ok entry. A completed run
    returns 0.
    """
    cli = argparse.ArgumentParser(description="Normalize mixed sources to .txt")
    cli.add_argument("--corpus", default="data/carti-camp-jocuri",
                     help="corpus root to walk")
    cli.add_argument("--out", default="data/sources", help="output directory")
    cli.add_argument("--check-deps", action="store_true",
                     help="run dependency preflight and exit")
    cli.add_argument("--ocr", action="store_true",
                     help="include OCR (tesseract) in the preflight check")
    options = cli.parse_args(argv)

    # The preflight runs exactly once on either path.
    report = preflight(check_ocr=options.ocr)
    if options.check_deps:
        return print_preflight(report)

    if report["missing_python"]:
        print_preflight(report)
        return 1
    for warning in report["warnings"]:
        print(f"WARNING: {warning}")

    summary = run(Path(options.corpus), Path(options.out))
    print(f"normalized : {summary['ok']}/{summary['total']}")
    print(f"errors : {summary['errors']}")
    print(f"empty : {summary['empty']}")
    print(f"needs_review: {summary['needs_review']}")
    failed = [r for r in summary["results"] if r["status"] != "ok"]
    for entry in failed:
        print(f" [{entry['status']}] {entry['source']}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -1,143 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PDF Mass Conversion to Text for Activity Extraction
|
||||
Handles all PDF sizes efficiently with multiple fallback methods
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import PyPDF2
|
||||
import pdfplumber
|
||||
from typing import List, Dict
|
||||
import logging
|
||||
|
||||
class PDFConverter:
    """Convert PDFs to page-marked plain text, with a PyPDF2 fallback."""

    def __init__(self, max_pages=50):
        """Store the page limit; ``max_pages=None`` disables truncation."""
        self.max_pages = max_pages
        self.conversion_stats = {}

    def convert_pdf_to_text(self, pdf_path: str) -> str:
        """Convert PDF to text using multiple methods with fallbacks.

        pdfplumber is tried first, then PyPDF2. Returns "" when both fail;
        the failure is recorded in ``conversion_stats`` with
        ``success=False``.
        """
        try:
            # Method 1: pdfplumber (best for tables and layout)
            return self._convert_with_pdfplumber(pdf_path)
        except Exception as e:
            print(f"pdfplumber failed for {pdf_path}: {e}")

        try:
            # Method 2: PyPDF2 (fallback)
            return self._convert_with_pypdf2(pdf_path)
        except Exception as e2:
            print(f"PyPDF2 also failed for {pdf_path}: {e2}")
            self.conversion_stats[pdf_path] = {
                'method': None,
                'pages_processed': 0,
                'total_pages': 0,
                'success': False,
                'text_length': 0
            }
            return ""

    def _page_limit(self, total_pages):
        """Return how many pages to process given the configured limit."""
        if self.max_pages is None:
            return total_pages
        return min(total_pages, self.max_pages)

    def _collect_pages(self, pages, limit):
        """Join the text of ``pages[0:limit]``, one PAGE marker per non-empty page."""
        parts = []
        for i in range(limit):
            try:
                page_text = pages[i].extract_text()
            except Exception as e:
                print(f" Error on page {i+1}: {e}")
                continue
            if page_text:
                parts.append(f"\n--- PAGE {i+1} ---\n{page_text}\n")
        return "".join(parts)

    def _record_success(self, pdf_path, method, pages_processed, total_pages, text_content):
        """Store per-file statistics for a successful conversion."""
        self.conversion_stats[pdf_path] = {
            'method': method,
            'pages_processed': pages_processed,
            'total_pages': total_pages,
            'success': True,
            'text_length': len(text_content)
        }

    def _convert_with_pdfplumber(self, pdf_path: str) -> str:
        """Primary conversion method using pdfplumber"""
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            pages_to_process = self._page_limit(total_pages)

            print(f" Converting {pdf_path}: {pages_to_process}/{total_pages} pages")

            text_content = self._collect_pages(pdf.pages, pages_to_process)

        self._record_success(pdf_path, 'pdfplumber', pages_to_process, total_pages, text_content)
        return text_content

    def _convert_with_pypdf2(self, pdf_path: str) -> str:
        """Fallback conversion method using PyPDF2"""
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            total_pages = len(reader.pages)
            pages_to_process = self._page_limit(total_pages)

            print(f" Converting {pdf_path} (fallback): {pages_to_process}/{total_pages} pages")

            text_content = self._collect_pages(reader.pages, pages_to_process)

        self._record_success(pdf_path, 'PyPDF2', pages_to_process, total_pages, text_content)
        return text_content

    def convert_all_pdfs(self, pdf_directory: str, output_directory: str):
        """Convert all PDFs in directory to text files.

        Each non-empty conversion is written to ``<output>/<pdf stem>.txt``
        with a SOURCE / CONVERTED header, and the collected statistics go to
        ``conversion_stats.json``. Returns the number of PDFs whose
        conversion was recorded as successful.

        NOTE(review): output names use only the PDF stem, so two PDFs with
        the same file name in different folders overwrite each other.
        """
        from datetime import date

        pdf_files = list(Path(pdf_directory).glob("**/*.pdf"))

        print(f"🔄 Converting {len(pdf_files)} PDF files to text...")

        os.makedirs(output_directory, exist_ok=True)
        # Real conversion date (the header previously carried a fixed date).
        converted_on = date.today().isoformat()

        for i, pdf_path in enumerate(pdf_files):
            print(f"\n[{i+1}/{len(pdf_files)}] Processing {pdf_path.name}...")

            # Convert to text
            text_content = self.convert_pdf_to_text(str(pdf_path))

            if text_content.strip():
                # Save as text file
                output_file = Path(output_directory) / f"{pdf_path.stem}.txt"
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(f"SOURCE: {pdf_path}\n")
                    f.write(f"CONVERTED: {converted_on}\n")
                    f.write("="*50 + "\n\n")
                    f.write(text_content)

                print(f" ✅ Saved: {output_file}")
            else:
                print(f" ❌ No text extracted from {pdf_path.name}")

        # Save conversion statistics
        stats_file = Path(output_directory) / "conversion_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(self.conversion_stats, f, indent=2, ensure_ascii=False)

        print(f"\n🎉 PDF conversion complete! Check {output_directory}")
        return sum(1 for stats in self.conversion_stats.values() if stats['success'])


# Usage
if __name__ == "__main__":
    converter = PDFConverter(max_pages=50)

    # Convert all PDFs
    pdf_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri"
    output_dir = "/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri/INDEX-SISTEM-JOCURI/converted_pdfs"

    converted_count = converter.convert_all_pdfs(pdf_dir, output_dir)
    print(f"Final result: {converted_count} PDFs successfully converted")
|
||||
145
scripts/review_queue.py
Normal file
145
scripts/review_queue.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
review_queue.py — CLI for the needs_review lifecycle (plan §5c).
|
||||
|
||||
Rows land in the queue when dedup leaves a borderline pair separate, or when a
|
||||
legacy `.doc` source was converted imperfectly. Each row has a stable content
|
||||
key; a decision written here is stored in data/review_decisions.json (git
|
||||
tracked) and re-applied by build_database.py on every rebuild, so the queue
|
||||
never resurfaces a resolved row.
|
||||
|
||||
Commands:
|
||||
python scripts/review_queue.py list
|
||||
python scripts/review_queue.py resolve <id> <merge|keep-separate|drop>
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent
|
||||
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
|
||||
if _p not in sys.path:
|
||||
sys.path.insert(0, _p)
|
||||
|
||||
from import_common import content_key, normalize_name # noqa: E402
|
||||
|
||||
VALID_DECISIONS = ("merge", "keep-separate", "drop")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# review_decisions.json
|
||||
# --------------------------------------------------------------------------
|
||||
def load_decisions(path: Path) -> dict:
    """Load the persisted review decisions from *path*.

    Returns the parsed JSON object. Returns an empty dict when the file does
    not exist, cannot be read, is not valid UTF-8 JSON, or does not hold a
    JSON object at the top level. The last three cases are reported on stderr
    so a damaged decisions file is not discarded silently.
    """
    if not path.is_file():
        return {}
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"warning: ignoring unreadable decisions file {path}: {exc}",
            file=sys.stderr,
        )
        return {}
    if not isinstance(data, dict):
        print(
            f"warning: ignoring decisions file {path}: top-level JSON value "
            f"is {type(data).__name__}, expected an object",
            file=sys.stderr,
        )
        return {}
    return data
|
||||
|
||||
|
||||
def save_decisions(decisions: dict, path: Path) -> None:
    """Write *decisions* to *path* as indented, key-sorted UTF-8 JSON.

    The parent directory is created when missing. The payload is written to a
    sibling temporary file and then renamed over *path*, so an interrupted
    write cannot leave a truncated decisions file behind.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(decisions, indent=2, ensure_ascii=False, sort_keys=True)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # Only present here if the write or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# queue
|
||||
# --------------------------------------------------------------------------
|
||||
def list_queue(db_path: Path) -> list[dict]:
    """Collect the rows flagged ``needs_review`` in the database at *db_path*.

    Each returned dict carries ``id`` (the content key computed by
    ``content_key``), ``name``, ``language`` and ``description``. An empty
    list is returned when the database file is missing or the query raises
    ``sqlite3.OperationalError``.
    """
    if not db_path.is_file():
        return []

    query = (
        "SELECT name, normalized_name, language, description "
        "FROM activities WHERE needs_review = 1 ORDER BY normalized_name"
    )
    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    try:
        flagged = connection.execute(query).fetchall()
    except sqlite3.OperationalError:
        return []
    finally:
        connection.close()

    def _as_entry(record):
        # Use the stored normalized name, or derive one from the display name.
        normalized = record["normalized_name"] or normalize_name(record["name"])
        description = record["description"] or ""
        return {
            "id": content_key(normalized, record["language"], description),
            "name": record["name"],
            "language": record["language"],
            "description": description,
        }

    return [_as_entry(record) for record in flagged]
|
||||
|
||||
|
||||
def resolve(decisions_path: Path, content_id: str, decision: str) -> dict:
    """Store *decision* for *content_id* in the decisions file.

    Loads the existing decisions, sets the entry for *content_id*, writes the
    result back to *decisions_path*, and returns the updated mapping.

    Raises:
        ValueError: if *decision* is not one of ``VALID_DECISIONS``.
    """
    if decision in VALID_DECISIONS:
        recorded = load_decisions(decisions_path)
        recorded[content_id] = {"decision": decision}
        save_decisions(recorded, decisions_path)
        return recorded
    raise ValueError(
        f"invalid decision {decision!r}; expected one of {VALID_DECISIONS}"
    )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI
|
||||
# --------------------------------------------------------------------------
|
||||
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point for the ``list`` and ``resolve`` sub-commands.

    Returns the process exit status: 0 after a handled command, 1 if the
    parsed command is neither ``list`` nor ``resolve``.
    """
    cli = argparse.ArgumentParser(description="needs_review queue CLI")
    cli.add_argument("--db", default="data/activities.db")
    cli.add_argument("--decisions", default="data/review_decisions.json")
    commands = cli.add_subparsers(dest="command", required=True)

    commands.add_parser("list", help="list rows currently flagged needs_review")

    resolve_cmd = commands.add_parser("resolve", help="record a decision for a row")
    resolve_cmd.add_argument("id", help="content id from `list`")
    resolve_cmd.add_argument("decision", choices=VALID_DECISIONS)

    options = cli.parse_args(argv)

    if options.command == "resolve":
        resolve(Path(options.decisions), options.id, options.decision)
        print(f"recorded: {options.id} -> {options.decision}")
        print(f"written to {options.decisions} (applied on next build_database --rebuild)")
        return 0

    if options.command != "list":
        return 1

    pending = list_queue(Path(options.db))
    if not pending:
        print("review queue is empty.")
        return 0

    print(f"{len(pending)} row(s) need review:\n")
    for item in pending:
        # One-line preview: first 80 characters, newlines flattened to spaces.
        preview = item["description"][:80].replace("\n", " ")
        print(f" id : {item['id']}")
        print(f" name : {item['name']} [{item['language']}]")
        print(f" desc : {preview}")
        print(f" -> review_queue.py resolve {item['id']} <merge|keep-separate|drop>")
        print()
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,50 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Main extraction orchestrator
|
||||
Ruleaza intregul proces de extractie
|
||||
run_extraction.py — extraction orchestrator (plan §3).
|
||||
|
||||
The pipeline is script-only up to the LLM step: this script normalizes the
|
||||
corpus, chunks the normalized sources, and emits one subagent prompt per
|
||||
`pending` chunk. It does NOT run the extraction itself — that step is the
|
||||
interactive Claude Code orchestrator launching waves of subagents.
|
||||
|
||||
Steps:
|
||||
1. normalize data/carti-camp-jocuri/ -> data/sources/*.txt
|
||||
2. chunk data/sources/*.txt -> data/chunks/<id>/*.txt + manifest.json
|
||||
3. emit one prompt per `pending` chunk -> data/chunks/_prompts/*.md
|
||||
4. report how many chunks remain `pending`
|
||||
|
||||
Usage:
|
||||
python scripts/run_extraction.py
|
||||
python scripts/run_extraction.py --skip-normalize # re-chunk only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from unified_processor import UnifiedProcessor
|
||||
from import_claude_activities import ClaudeActivityImporter
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent
|
||||
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
|
||||
if _p not in sys.path:
|
||||
sys.path.insert(0, _p)
|
||||
|
||||
import chunk_sources # noqa: E402
|
||||
import normalize_sources # noqa: E402
|
||||
|
||||
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
|
||||
|
||||
|
||||
def emit_chunk_prompt(chunk_key: str, meta: dict, prompts_dir: Path) -> Path:
    """Render the subagent prompt for one chunk and save it under *prompts_dir*.

    Values are taken from the manifest entry *meta*, with placeholders used
    for any missing ``chunk_file``, ``expected_json`` or ``chunk_range``.
    Returns the path of the written ``<chunk_key>.prompt.md`` file.
    """
    chunk_file = meta.get("chunk_file", f"data/chunks/<id>/{chunk_key}.txt")
    expected_json = meta.get("expected_json", f"{chunk_key}.json")
    chunk_range = meta.get("chunk_range", "?")
    source_id = meta.get("source_id", "")
    source_hash = meta.get("source_hash", "")
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT)

    sections = [
        f"# EXTRACTION — chunk `{chunk_key}`",
        "",
        f"Read ONLY this chunk: `{chunk_file}`",
        f"Chunk range: {chunk_range}",
        "",
        f"Follow the rules in `{rules_path}`.",
        "Identify every distinct activity, fill the schema "
        "(`scripts/activity_schema.json`), and write the result to:",
        "",
        f" data/extracted/{expected_json}",
        "",
        "Header fields to set: "
        f'source_id="{source_id}", chunk_key="{chunk_key}", '
        f'source_hash="{source_hash}".',
        "",
    ]

    prompts_dir.mkdir(parents=True, exist_ok=True)
    prompt_path = prompts_dir / f"{chunk_key}.prompt.md"
    prompt_path.write_text("\n".join(sections), encoding="utf-8")
    return prompt_path
|
||||
|
||||
|
||||
def run(
    *,
    corpus_root: Path,
    sources_dir: Path,
    chunks_dir: Path,
    skip_normalize: bool = False,
) -> dict:
    """Normalize (optionally), chunk, and emit a prompt per pending chunk.

    Returns a summary dict with the keys ``normalized`` (only when
    normalization ran), ``chunks``, ``states`` (count per manifest state),
    ``pending`` (number of pending chunks) and ``prompts_dir``.
    """
    summary: dict = {}

    if not skip_normalize:
        normalized = normalize_sources.run(corpus_root, sources_dir)
        summary["normalized"] = {
            field: normalized[field] for field in ("ok", "total", "errors")
        }

    summary["chunks"] = chunk_sources.run(sources_dir, chunks_dir)

    manifest = chunk_sources.load_manifest(chunks_dir / "manifest.json")
    chunk_entries = manifest["chunks"]
    prompts_dir = chunks_dir / "_prompts"

    # Only chunks still in the `pending` state get a prompt.
    pending = {
        key: entry
        for key, entry in chunk_entries.items()
        if entry.get("state") == "pending"
    }
    for key in sorted(pending):
        emit_chunk_prompt(key, pending[key], prompts_dir)

    # Tally every chunk by state; entries without a state count under "?".
    state_counts: dict[str, int] = {}
    for entry in chunk_entries.values():
        label = entry.get("state", "?")
        state_counts[label] = state_counts.get(label, 0) + 1

    summary["states"] = state_counts
    summary["pending"] = len(pending)
    summary["prompts_dir"] = str(prompts_dir)
    return summary
|
||||
|
||||
|
||||
def main(argv: Optional[list[str]] = None) -> int:
    """Parse the command line, run the orchestrator, and print its summary.

    Always returns 0.
    """
    cli = argparse.ArgumentParser(description="Extraction orchestrator.")
    cli.add_argument("--corpus", default="data/carti-camp-jocuri")
    cli.add_argument("--sources", default="data/sources")
    cli.add_argument("--chunks", default="data/chunks")
    cli.add_argument(
        "--skip-normalize",
        action="store_true",
        help="skip normalization, re-chunk existing sources only",
    )
    options = cli.parse_args(argv)

    summary = run(
        corpus_root=Path(options.corpus),
        sources_dir=Path(options.sources),
        chunks_dir=Path(options.chunks),
        skip_normalize=options.skip_normalize,
    )

    banner = "=" * 60
    print(banner)
    print("EXTRACTION ORCHESTRATOR")
    print(banner)

    # The "normalized" entry is absent when normalization was skipped.
    normalized = summary.get("normalized") if "normalized" in summary else None
    if "normalized" in summary:
        print(
            f"normalized : {normalized['ok']}/{normalized['total']} "
            f"(errors {normalized['errors']})"
        )
    print(f"chunks : {summary['chunks']['chunks']}")
    for state in sorted(summary["states"]):
        print(f" {state:<10}: {summary['states'][state]}")

    print(f"\npending chunks remaining : {summary['pending']}")
    if not summary["pending"]:
        print("All chunks extracted — run build_database.py --rebuild.")
    else:
        print(f"subagent prompts written : {summary['prompts_dir']}/")
        print("Launch waves of ~5-10 subagents on those prompts, then run "
              "validate_extractions.py and build_database.py --rebuild.")
    print(banner)
    return 0
|
||||
|
||||
def main():
    """Legacy interactive driver: automated pass, manual pause, JSON import.

    Runs the automated extractors, asks the operator on stdin whether the
    manual Claude step is finished, and imports the extracted JSON files only
    when the answer is ``y`` (case-insensitive).
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 40

    print(heavy_rule)
    print("ACTIVITY EXTRACTION SYSTEM")
    print("Strategy S8: Hybrid Claude + Scripts")
    print(heavy_rule)

    # Step 1: automated extraction.
    print("\nSTEP 1: Automated Extraction")
    print(light_rule)
    UnifiedProcessor().process_automated_formats()

    # Step 2: pause until the operator has done the manual processing.
    print("\n" + heavy_rule)
    print("STEP 2: Manual Claude Processing Required")
    print(light_rule)
    print("Please process PDF/DOC files with Claude using the template.")
    print("Files are listed in: pdf_doc_for_claude.txt")
    print("Save extracted activities as JSON in: scripts/extracted_activities/")
    print(heavy_rule)

    answer = input("\nHave you completed Claude processing? (y/n): ")

    if answer.lower() == 'y':
        # Step 3: import the activities extracted manually.
        print("\nSTEP 3: Importing Claude-extracted activities")
        print(light_rule)
        ClaudeActivityImporter().import_all_json_files()

    print("\n" + heavy_rule)
    print("EXTRACTION COMPLETE!")
    print(heavy_rule)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
raise SystemExit(main())
|
||||
|
||||
@@ -1,197 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Text/Markdown Activity Extractor
|
||||
Proceseaza fisiere TXT si MD pentru extractie activitati
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
import sqlite3
|
||||
from datetime import datetime
|
||||
|
||||
class TextActivityExtractor:
    """Heuristic extractor of activities from plain-text and Markdown files.

    Three independent heuristics are applied to each file (Markdown headers,
    marker keywords, blank-line-separated blocks). Their results are
    concatenated without cross-heuristic de-duplication, so one activity may
    be reported more than once.
    """

    def __init__(self, db_path='data/activities.db'):
        """Remember the SQLite database path and set up the pattern tables."""
        self.db_path = db_path
        self.activity_patterns = {
            'section_headers': [
                r'^#{1,6}\s*(.+)$',  # Markdown headers
                r'^([A-Z][^\.]{10,100})$',  # plain title lines
                r'^\d+\.\s*(.+)$',  # numbered lists
                r'^[•\-\*]\s*(.+)$',  # bullet points
            ],
            'activity_markers': [
                'joc:', 'activitate:', 'exercitiu:', 'team building:',
                'nume:', 'titlu:', 'denumire:'
            ]
        }

    def extract_from_text(self, file_path: str) -> List[Dict]:
        """Extract activities from one text/Markdown file.

        Any exception raised while reading or parsing is printed and the
        activities gathered so far are returned (best effort).
        """
        activities = []

        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

            # Method 1: Markdown sections (only for .md files).
            if file_path.endswith('.md'):
                activities.extend(self._extract_from_markdown(content, file_path))

            # Method 2: generic marker patterns.
            activities.extend(self._extract_from_patterns(content, file_path))

            # Method 3: structured text blocks.
            activities.extend(self._extract_from_blocks(content, file_path))

        except Exception as e:
            print(f"Error processing {file_path}: {e}")

        return activities

    def _extract_from_markdown(self, content, source_file):
        """Extract activities delimited by level 1-3 Markdown headers.

        A header starts an activity when its text contains 'joc',
        'activitate' or 'exercitiu'; the following non-blank lines (at most
        20) become the description. Activities without content are dropped.
        """
        activities = []
        lines = content.split('\n')

        current_activity = None
        current_content = []

        for line in lines:
            # Is this line a header?
            if re.match(r'^#{1,3}\s*(.+)', line):
                # Flush the previous activity, if it collected any content.
                if current_activity and current_content:
                    current_activity['description'] = '\n'.join(current_content[:20])  # max 20 lines
                    activities.append(current_activity)

                # Does the new header look like an activity?
                header_text = re.sub(r'^#{1,3}\s*', '', line)
                if any(marker in header_text.lower() for marker in ['joc', 'activitate', 'exercitiu']):
                    current_activity = {
                        'name': header_text[:200],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    current_content = []
                else:
                    current_activity = None

            elif current_activity:
                # Accumulate non-blank content for the current activity.
                if line.strip():
                    current_content.append(line)

        # Flush the last activity.
        if current_activity and current_content:
            current_activity['description'] = '\n'.join(current_content[:20])
            activities.append(current_activity)

        return activities

    def _extract_from_patterns(self, content, source_file):
        """Extract activities introduced by one of the marker keywords.

        The text after a marker, up to the next blank line, the next
        occurrence of the same marker, or end of text, is kept when longer
        than 20 characters.
        """
        activities = []

        for marker in self.activity_patterns['activity_markers']:
            pattern = re.compile(f'{re.escape(marker)}\\s*(.+?)(?=\\n\\n|{re.escape(marker)}|$)',
                                 re.IGNORECASE | re.DOTALL)
            matches = pattern.finditer(content)

            for match in matches:
                activity_text = match.group(1)
                if len(activity_text) > 20:
                    activity = {
                        'name': activity_text.split('\n')[0][:200],
                        'description': activity_text[:1000],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    activities.append(activity)

        return activities

    def _extract_from_blocks(self, content, source_file):
        """Extract activities from blank-line-separated blocks.

        A block longer than 50 characters is kept when its first line
        contains 'joc', 'activitate' or 'exercitiu'.
        """
        activities = []

        # Split into blocks separated by blank lines.
        blocks = re.split(r'\n\s*\n', content)

        for block in blocks:
            if len(block) > 50:  # at least 50 characters
                lines = block.strip().split('\n')
                first_line = lines[0].strip()

                # Does the block look like an activity?
                if any(keyword in first_line.lower() for keyword in ['joc', 'activitate', 'exercitiu']):
                    activity = {
                        'name': first_line[:200],
                        'description': block[:1000],
                        'source_file': str(source_file),
                        'category': '[A]'
                    }
                    activities.append(activity)

        return activities

    def save_to_database(self, activities):
        """Insert *activities* into the ``activities`` table.

        A row is skipped when one with the same name and source_file already
        exists. Per-row failures are printed and skipped. Returns the number
        of rows inserted. The connection is closed even if an unexpected
        error escapes the per-row handling.
        """
        conn = sqlite3.connect(self.db_path)
        saved_count = 0

        try:
            cursor = conn.cursor()

            for activity in activities:
                try:
                    # Check for duplicates.
                    cursor.execute(
                        "SELECT id FROM activities WHERE name = ? AND source_file = ?",
                        (activity.get('name'), activity.get('source_file'))
                    )

                    if not cursor.fetchone():
                        columns = list(activity.keys())
                        values = list(activity.values())
                        placeholders = ['?' for _ in values]

                        # NOTE(review): column names are interpolated into the
                        # SQL text; the dicts built in this class use fixed
                        # keys, but callers must not pass untrusted keys.
                        query = f"INSERT INTO activities ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
                        cursor.execute(query, values)
                        saved_count += 1

                except Exception as e:
                    print(f"Error saving: {e}")

            conn.commit()
        finally:
            conn.close()

        return saved_count

    def process_all_text_files(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every ``*.txt`` and ``*.md`` file under *base_path*.

        Returns a ``(files_found, activities_saved)`` tuple.
        """
        base_path = Path(base_path)

        text_files = list(base_path.rglob("*.txt"))
        md_files = list(base_path.rglob("*.md"))
        all_files = text_files + md_files

        print(f"Found {len(all_files)} text/markdown files")

        all_activities = []

        for file_path in all_files:
            activities = self.extract_from_text(str(file_path))
            all_activities.extend(activities)
            print(f"Processed {file_path.name}: {len(activities)} activities")

        # Save to database.
        saved = self.save_to_database(all_activities)
        print(f"\nTotal saved: {saved} activities from {len(all_files)} files")

        return len(all_files), saved
|
||||
|
||||
if __name__ == "__main__":
|
||||
extractor = TextActivityExtractor()
|
||||
extractor.process_all_text_files()
|
||||
@@ -1,151 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Unified Activity Processor
|
||||
Orchestreaz toate extractoarele pentru procesare complet
|
||||
"""
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
from html_extractor import HTMLActivityExtractor
|
||||
from text_extractor import TextActivityExtractor
|
||||
import sqlite3
|
||||
|
||||
class UnifiedProcessor:
    """Run the HTML and text extractors and report what is left for manual work."""

    def __init__(self, db_path='data/activities.db'):
        """Create the two extractors on *db_path* and reset the statistics."""
        self.db_path = db_path
        self.html_extractor = HTMLActivityExtractor(db_path)
        self.text_extractor = TextActivityExtractor(db_path)
        self.stats = {
            'html_processed': 0,
            'text_processed': 0,
            'pdf_to_process': 0,
            'doc_to_process': 0,
            'total_activities': 0,
            'start_time': None,
            'end_time': None
        }

    def get_current_activity_count(self):
        """Return the current number of rows in the ``activities`` table."""
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM activities")
            return cursor.fetchone()[0]
        finally:
            # Close the connection even when the query fails.
            conn.close()

    def count_files_to_process(self, base_path):
        """Count the files under *base_path* (recursively) per format.

        Returns a dict with the keys 'html' (``*.html`` plus ``*.htm``),
        'txt', 'md', 'pdf', 'doc' and 'docx'.
        """
        base_path = Path(base_path)

        counts = {
            'html': len(list(base_path.rglob("*.html"))) + len(list(base_path.rglob("*.htm"))),
            'txt': len(list(base_path.rglob("*.txt"))),
            'md': len(list(base_path.rglob("*.md"))),
            'pdf': len(list(base_path.rglob("*.pdf"))),
            'doc': len(list(base_path.rglob("*.doc"))),
            'docx': len(list(base_path.rglob("*.docx")))
        }

        return counts

    def process_automated_formats(self, base_path='/mnt/d/GoogleDrive/Cercetasi/carti-camp-jocuri'):
        """Process every format that can be handled automatically.

        Runs the HTML extractor, then the text/Markdown extractor, records
        timing and counts in ``self.stats``, prints the summary, and writes
        the list of PDF/DOC files left for manual processing.
        """
        print("="*60)
        print("UNIFIED ACTIVITY PROCESSOR - AUTOMATED PHASE")
        print("="*60)

        self.stats['start_time'] = time.time()
        initial_count = self.get_current_activity_count()

        # Show the initial statistics.
        file_counts = self.count_files_to_process(base_path)
        print("\nFiles to process:")
        for fmt, count in file_counts.items():
            print(f" {fmt.upper()}: {count} files")
        print(f"\nCurrent activities in database: {initial_count}")
        print("-"*60)

        # PHASE 1: HTML processing (top priority - large volume).
        print("\n[1/2] Processing HTML files...")
        print("-"*40)
        html_processed, _ = self.html_extractor.process_all_html_files(base_path)
        self.stats['html_processed'] = html_processed

        # PHASE 2: text/Markdown processing.
        print("\n[2/2] Processing Text/Markdown files...")
        print("-"*40)
        text_processed, _ = self.text_extractor.process_all_text_files(base_path)
        self.stats['text_processed'] = text_processed

        # Final statistics.
        self.stats['end_time'] = time.time()
        final_count = self.get_current_activity_count()
        self.stats['total_activities'] = final_count - initial_count

        # Files that still need manual processing.
        self.stats['pdf_to_process'] = file_counts['pdf']
        self.stats['doc_to_process'] = file_counts['doc'] + file_counts['docx']

        self.print_summary()
        self.save_pdf_doc_list(base_path)

    def print_summary(self):
        """Print the processing summary collected in ``self.stats``."""
        print("\n" + "="*60)
        print("PROCESSING SUMMARY")
        print("="*60)

        duration = self.stats['end_time'] - self.stats['start_time']

        print("\nAutomated Processing Results:")
        print(f" HTML files processed: {self.stats['html_processed']}")
        print(f" Text/MD files processed: {self.stats['text_processed']}")
        print(f" New activities added: {self.stats['total_activities']}")
        print(f" Processing time: {duration:.1f} seconds")

        print("\nFiles requiring Claude processing:")
        print(f" PDF files: {self.stats['pdf_to_process']}")
        print(f" DOC/DOCX files: {self.stats['doc_to_process']}")

        print("\n" + "="*60)
        print("NEXT STEPS:")
        print("1. Review the file 'pdf_doc_for_claude.txt' for manual processing")
        print("2. Use Claude to extract activities from PDF/DOC files")
        print("3. Focus on largest PDF files first (highest activity density)")
        print("="*60)

    def save_pdf_doc_list(self, base_path):
        """Write the PDF/DOC work list to ``pdf_doc_for_claude.txt``.

        The file is created in the current working directory. PDFs are sorted
        by size, largest first, and only the top 20 are listed by name;
        DOC/DOCX files are listed in discovery order.
        """
        base_path = Path(base_path)

        pdf_files = sorted(base_path.rglob("*.pdf"), key=lambda p: p.stat().st_size, reverse=True)
        doc_files = list(base_path.rglob("*.doc"))
        docx_files = list(base_path.rglob("*.docx"))

        with open('pdf_doc_for_claude.txt', 'w', encoding='utf-8') as f:
            f.write("PDF/DOC FILES FOR CLAUDE PROCESSING\n")
            f.write("="*60 + "\n")
            f.write("Files sorted by size (largest first = likely more activities)\n\n")

            f.write("TOP PRIORITY PDF FILES (process these first):\n")
            f.write("-"*40 + "\n")
            for i, pdf in enumerate(pdf_files[:20], 1):
                size_mb = pdf.stat().st_size / (1024*1024)
                f.write(f"{i}. {pdf.name} ({size_mb:.1f} MB)\n")
                f.write(f" Path: {pdf}\n\n")

            if len(pdf_files) > 20:
                f.write(f"\n... and {len(pdf_files)-20} more PDF files\n\n")

            f.write("\nDOC/DOCX FILES:\n")
            f.write("-"*40 + "\n")
            for doc in doc_files + docx_files:
                size_kb = doc.stat().st_size / 1024
                f.write(f"- {doc.name} ({size_kb:.1f} KB)\n")

        print("\nPDF/DOC list saved to: pdf_doc_for_claude.txt")
|
||||
|
||||
if __name__ == "__main__":
|
||||
processor = UnifiedProcessor()
|
||||
processor.process_automated_formats()
|
||||
208
scripts/validate_extractions.py
Normal file
208
scripts/validate_extractions.py
Normal file
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
validate_extractions.py — validate every data/extracted/*.json (plan §5b).
|
||||
|
||||
For each extraction file it runs two checks:
|
||||
1. JSON-schema validation against scripts/activity_schema.json,
|
||||
2. the source_excerpt anti-hallucination check (each excerpt must be a fuzzy
|
||||
substring of the chunk it came from).
|
||||
|
||||
For every failing chunk it:
|
||||
* writes the exact re-extraction prompt to data/extracted/_reextract/<chunk>.prompt.md,
|
||||
* marks the chunk `rejected` in data/chunks/manifest.json.
|
||||
|
||||
The orchestrator then re-launches subagents only on the `rejected` chunks; the
|
||||
loop repeats until nothing is rejected.
|
||||
|
||||
Usage:
|
||||
python scripts/validate_extractions.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent
|
||||
for _p in (str(SCRIPT_DIR), str(REPO_ROOT)):
|
||||
if _p not in sys.path:
|
||||
sys.path.insert(0, _p)
|
||||
|
||||
from import_common import ( # noqa: E402
|
||||
DEFAULT_SCHEMA_PATH,
|
||||
chunk_key_for,
|
||||
excerpt_matches,
|
||||
excerpt_score,
|
||||
find_chunk_text,
|
||||
iter_extraction_files,
|
||||
load_schema,
|
||||
validate_extraction,
|
||||
)
|
||||
|
||||
SUBAGENT_PROMPT = SCRIPT_DIR / "SUBAGENT_PROMPT.md"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# re-extraction prompt
|
||||
# --------------------------------------------------------------------------
|
||||
def build_reextraction_prompt(
    chunk_key: str, chunk_file: Optional[str], errors: list[str]
) -> str:
    """Compose the Markdown prompt asking a subagent to redo a rejected chunk.

    The prompt lists every entry of *errors* as a bullet and points at
    *chunk_file*, or at a placeholder path built from *chunk_key* when
    *chunk_file* is empty or None.
    """
    target = chunk_file if chunk_file else f"data/chunks/<source_id>/{chunk_key}.txt"
    rules_path = SUBAGENT_PROMPT.relative_to(REPO_ROOT)
    reasons = [f"- {problem}" for problem in errors]

    return "\n".join([
        f"# RE-EXTRACTION — chunk `{chunk_key}`",
        "",
        "The previous extraction for this chunk was **REJECTED**. Reasons:",
        "",
        *reasons,
        "",
        "## What to do",
        "",
        f"1. Read ONLY this chunk: `{target}`",
        f"2. Follow the extraction rules in `{rules_path}`.",
        "3. Fix every problem listed above. In particular:",
        " - every `source_excerpt` must be copied **verbatim** from the chunk",
        " (it is checked as a fuzzy substring — invented quotes are rejected);",
        " - `source_excerpt` and `page_reference` are mandatory on every activity;",
        " - the output must validate against `scripts/activity_schema.json`.",
        f"4. Overwrite the extraction file `data/extracted/{chunk_key}.json`.",
        "",
    ])
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# manifest
|
||||
# --------------------------------------------------------------------------
|
||||
def load_manifest(manifest_path: Path) -> dict:
    """Load the chunk manifest, always returning a dict with a ``chunks`` dict.

    Returns ``{"chunks": {}}`` when the file is missing, unreadable, not
    valid UTF-8 JSON, or not a JSON object at the top level; the last three
    cases are reported on stderr. A missing or non-dict ``chunks`` value is
    replaced by an empty dict so callers can index it safely.
    """
    if not manifest_path.is_file():
        return {"chunks": {}}
    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"warning: ignoring unreadable manifest {manifest_path}: {exc}",
            file=sys.stderr,
        )
        return {"chunks": {}}
    if not isinstance(data, dict):
        print(
            f"warning: ignoring manifest {manifest_path}: top-level JSON value "
            f"is {type(data).__name__}, expected an object",
            file=sys.stderr,
        )
        return {"chunks": {}}
    if not isinstance(data.get("chunks"), dict):
        data["chunks"] = {}
    return data
|
||||
|
||||
|
||||
def save_manifest(manifest: dict, manifest_path: Path) -> None:
    """Write *manifest* to *manifest_path* as indented UTF-8 JSON.

    The parent directory is created when missing. The payload is written to a
    sibling temporary file and then renamed over *manifest_path*, so an
    interrupted write cannot leave a truncated manifest behind.
    """
    manifest_path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(manifest, indent=2, ensure_ascii=False)
    tmp_path = manifest_path.with_name(manifest_path.name + ".tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(manifest_path)
    finally:
        # Only present here if the write or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()
|
||||
|
||||
|
||||
def mark_rejected(manifest: dict, chunk_key: str) -> None:
    """Set the manifest state of *chunk_key* to ``rejected``.

    An entry is created when the chunk is not in the manifest yet; the other
    fields of an existing entry are left untouched.
    """
    manifest["chunks"].setdefault(chunk_key, {})["state"] = "rejected"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# validation
|
||||
# --------------------------------------------------------------------------
|
||||
def validate_file(json_path: Path, schema: dict, chunks_dir: Path) -> list[str]:
    """Return the list of errors for one extraction file (empty == valid).

    Checks run in order and stop at the first failing stage:

    1. the file must decode as UTF-8 and parse as JSON;
    2. the document must pass ``validate_extraction`` against *schema*;
    3. the source chunk text must be found under *chunks_dir*;
    4. each activity's ``source_excerpt`` must match the chunk text.
    """
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        return [f"invalid JSON: {exc}"]
    except UnicodeDecodeError as exc:
        # Report a badly encoded file as a rejection instead of aborting the run.
        return [f"invalid JSON: file is not valid UTF-8 ({exc})"]

    errors = validate_extraction(data, schema)
    if errors:
        return errors

    header = data.get("header", {})
    chunk_text = find_chunk_text(json_path, header, chunks_dir)
    if chunk_text is None:
        return [f"source chunk not found for {chunk_key_for(json_path, header)}"]

    for adict in data.get("activities", []):
        excerpt = adict.get("source_excerpt") or ""
        if not excerpt_matches(excerpt, chunk_text):
            # The score is only computed for the error message.
            score = excerpt_score(excerpt, chunk_text)
            errors.append(
                f"activity {adict.get('name')!r}: source_excerpt not found in "
                f"chunk (best match {score:.0f}/100) — possible hallucination"
            )
    return errors
|
||||
|
||||
|
||||
def _read_extraction_header(json_path: Path) -> dict:
    """Best-effort read of an extraction file's ``header`` object ({} if unusable)."""
    try:
        data = json.loads(json_path.read_text(encoding="utf-8"))
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError.
        return {}
    header = data.get("header") if isinstance(data, dict) else None
    return header if isinstance(header, dict) else {}


def run(
    extracted_dir: Path,
    chunks_dir: Path,
    manifest_path: Path,
    schema_path: Path = DEFAULT_SCHEMA_PATH,
) -> dict:
    """Validate every extraction file and record the rejected chunks.

    For each failing file a re-extraction prompt is written under
    ``<extracted_dir>/_reextract/`` and the chunk is marked ``rejected`` in
    the manifest, which is saved once at the end.

    Returns a report dict with ``total``, ``valid``, ``rejected`` and
    ``rejected_chunks`` (a list of ``{"chunk": key, "errors": [...]}``).
    """
    schema = load_schema(schema_path)
    manifest = load_manifest(manifest_path)
    reextract_dir = extracted_dir / "_reextract"

    report = {"total": 0, "valid": 0, "rejected": 0, "rejected_chunks": []}
    for json_path in iter_extraction_files(extracted_dir):
        report["total"] += 1
        errors = validate_file(json_path, schema, chunks_dir)
        if not errors:
            report["valid"] += 1
            continue

        report["rejected"] += 1
        # A rejected file may be malformed in any way (array at the top
        # level, null header, bad encoding), so read the header defensively.
        header = _read_extraction_header(json_path)
        chunk_key = chunk_key_for(json_path, header)
        meta = manifest["chunks"].get(chunk_key)
        chunk_file = meta.get("chunk_file") if meta else None

        reextract_dir.mkdir(parents=True, exist_ok=True)
        prompt = build_reextraction_prompt(chunk_key, chunk_file, errors)
        (reextract_dir / f"{chunk_key}.prompt.md").write_text(prompt, encoding="utf-8")

        mark_rejected(manifest, chunk_key)
        report["rejected_chunks"].append({"chunk": chunk_key, "errors": errors})

    save_manifest(manifest, manifest_path)
    return report
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# CLI
|
||||
# --------------------------------------------------------------------------
|
||||
def main(argv: Optional[list[str]] = None) -> int:
    """Parse the command line, validate the extractions, and print the report.

    Always returns 0, whether or not any chunk was rejected.
    """
    cli = argparse.ArgumentParser(description="Validate extraction JSON files.")
    cli.add_argument("--extracted", default="data/extracted")
    cli.add_argument("--chunks", default="data/chunks")
    cli.add_argument("--manifest", default="data/chunks/manifest.json")
    cli.add_argument("--schema", default=str(DEFAULT_SCHEMA_PATH))
    options = cli.parse_args(argv)

    report = run(
        Path(options.extracted),
        Path(options.chunks),
        Path(options.manifest),
        Path(options.schema),
    )

    print(f"extraction files : {report['total']}")
    print(f" valid : {report['valid']}")
    print(f" rejected : {report['rejected']}")
    for rejected in report["rejected_chunks"]:
        print(f" [rejected] {rejected['chunk']}")
        for problem in rejected["errors"]:
            print(f" - {problem}")
    if report["rejected"]:
        print(f"\nRe-extraction prompts written to {options.extracted}/_reextract/")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
114
tests/conftest.py
Normal file
114
tests/conftest.py
Normal file
@@ -0,0 +1,114 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Shared pytest fixtures for the extraction-pipeline tests.
|
||||
|
||||
scripts/ is not a package, so it is added to sys.path here. Synthetic fixtures
|
||||
(PDF, docx, zip, HTML) are generated at runtime — no binary blobs in the repo.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# scripts/ is not a package: expose it on sys.path (once) so tests can import
# the pipeline modules directly.
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = REPO_ROOT / "scripts"
_scripts_entry = str(SCRIPTS_DIR)
if _scripts_entry not in sys.path:
    sys.path.insert(0, _scripts_entry)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic PDF — deliberately large to pin the "no max_pages" regression
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def big_pdf(tmp_path):
    """Write a 60-page PDF to ``tmp_path`` and return its path.

    Every page carries a unique ``PDFMARK-<n>`` token plus one fixed
    Romanian sentence.
    """
    from reportlab.lib.pagesizes import letter
    from reportlab.pdfgen import canvas

    pdf_path = tmp_path / "big.pdf"
    pdf = canvas.Canvas(str(pdf_path), pagesize=letter)
    page_no = 1
    while page_no <= 60:
        pdf.drawString(72, 720, f"PDFMARK-{page_no} synthetic activity page number {page_no}")
        pdf.drawString(72, 700, "Acest joc educativ se joaca in echipa.")
        pdf.showPage()  # finish this page and start the next one
        page_no += 1
    pdf.save()
    return pdf_path
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic docx — 100 paragraphs => 3 synthetic pages at 40 paras/page
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def sample_docx(tmp_path):
    """Write a 100-paragraph .docx to ``tmp_path`` and return its path."""
    import docx

    target = tmp_path / "sample.docx"
    doc = docx.Document()
    paragraph_texts = (
        f"Paragraf {idx}: continut joc team-building." for idx in range(100)
    )
    for text in paragraph_texts:
        doc.add_paragraph(text)
    doc.save(str(target))
    return target
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic HTML mirror page — with nav/script/footer chrome to strip
|
||||
# --------------------------------------------------------------------------
|
||||
# Real content lives in <main>; style/script/nav/header/footer are the chrome
# that extraction is expected to strip.
HTML_WITH_NAV = """<!doctype html>
<html><head><title>Joc</title>
<style>.x{color:red}</style>
<script>var tracking = 1;</script>
</head><body>
<nav><a href="/">Home</a><a href="/games">Games</a></nav>
<header>Site Banner Junk</header>
<main>
<h1>Vanatoarea de comori</h1>
<p>Acesta este un joc real de orientare pentru cercetasi.</p>
<p>Jucatorii cauta indicii ascunse in tabara.</p>
</main>
<footer>Copyright 2024 - toate drepturile rezervate</footer>
</body></html>
"""


@pytest.fixture
def html_with_nav(tmp_path):
    """Write ``HTML_WITH_NAV`` to ``page.html`` under ``tmp_path``; return the path."""
    html_file = tmp_path / "page.html"
    html_file.write_text(HTML_WITH_NAV, encoding="utf-8")
    return html_file
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic zip — contains a docx and a stray junk file
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def sample_zip(tmp_path, sample_docx):
    """Build ``archive.zip`` holding the sample docx and a stray ``desktop.ini``."""
    archive = tmp_path / "archive.zip"
    with zipfile.ZipFile(archive, mode="w") as bundle:
        bundle.write(sample_docx, arcname="inner/sample.docx")
        bundle.writestr("desktop.ini", "junk")
    return archive
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# synthetic normalized source — paginated, with an activity straddling a
|
||||
# page boundary so the chunker overlap can be verified.
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.fixture
def paginated_source(tmp_path):
    """A 50-page normalized source. An activity spans the page 20/21 boundary."""
    # Pages 20 and 21 carry the two halves of the straddling activity.
    boundary_pages = {
        20: "ACTIVITY-START jocul podului care traverseaza pagina",
        21: "continuare a jocului podului ACTIVITY-END",
    }
    rows = [
        "SOURCE: synthetic/test.pdf",
        "CONVERTED: 2026-05-19",
        "FORMAT: pdf",
        "=" * 50,
        "",
    ]
    for page_no in range(1, 51):
        content = boundary_pages.get(page_no, f"continut obisnuit pe pagina {page_no}")
        rows += [f"--- PAGE {page_no} ---", content, ""]

    target = tmp_path / "src_paginated.txt"
    target.write_text("\n".join(rows), encoding="utf-8")
    return target
|
||||
3
tests/fixtures/.gitkeep
vendored
Normal file
3
tests/fixtures/.gitkeep
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# Test fixtures (synthetic PDF/docx/zip/HTML) are generated at runtime by
|
||||
# tests/conftest.py — no binary blobs are committed. This file only preserves
|
||||
# the directory in git.
|
||||
334
tests/test_build_database.py
Normal file
334
tests/test_build_database.py
Normal file
@@ -0,0 +1,334 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for scripts/build_database.py — the import / dedup / swap side.
|
||||
|
||||
Covers: category -> slug + `altele` fallback; dedup across all three threshold
|
||||
bands; EN != RO never merged; field combination on merge; atomic swap with a
|
||||
simulated mid-build crash; the source_excerpt substring check.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Both the repo root (for ``app.*``) and scripts/ (for the pipeline modules)
# must be importable before the project imports that follow.
REPO_ROOT = Path(__file__).resolve().parents[1]
SCRIPTS_DIR = REPO_ROOT / "scripts"
for _p in map(str, (REPO_ROOT, SCRIPTS_DIR)):
    if _p in sys.path:
        continue
    sys.path.insert(0, _p)
|
||||
|
||||
import build_database as bd # noqa: E402
|
||||
from app.models.activity import Activity # noqa: E402
|
||||
from app.models.database import DatabaseManager # noqa: E402
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def _activity(**over):
    """Build an ``Activity`` from default test fields, overridden by ``over``."""
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
    }
    return Activity(**{**defaults, **over})
|
||||
|
||||
|
||||
def _ext_activity(**over):
|
||||
"""A schema-valid extraction-JSON activity object."""
|
||||
base = dict(
|
||||
name="Jocul testului",
|
||||
description="O activitate de echipa in aer liber.",
|
||||
category="team-building",
|
||||
content_type="joc",
|
||||
language="ro",
|
||||
extraction_confidence="high",
|
||||
source_excerpt="ANCHOR-EXCERPT despre jocul testului",
|
||||
page_reference="page 1",
|
||||
)
|
||||
base.update(over)
|
||||
return base
|
||||
|
||||
|
||||
def _write_extraction(extracted_dir, chunk_key, activities, source_id="src01"):
|
||||
extracted_dir.mkdir(parents=True, exist_ok=True)
|
||||
payload = {
|
||||
"header": {
|
||||
"source_hash": "hash1234deadbeef",
|
||||
"schema_version": "1.0",
|
||||
"prompt_version": "1.0",
|
||||
"chunk_range": "pages 1-20",
|
||||
"source_id": source_id,
|
||||
"chunk_key": chunk_key,
|
||||
},
|
||||
"activities": activities,
|
||||
}
|
||||
(extracted_dir / f"{chunk_key}.json").write_text(
|
||||
json.dumps(payload, ensure_ascii=False), encoding="utf-8"
|
||||
)
|
||||
|
||||
|
||||
def _write_chunk(chunks_dir, source_id, chunk_key, text):
|
||||
d = chunks_dir / source_id
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
(d / f"{chunk_key}.txt").write_text(text, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 3 — category normalization
|
||||
# --------------------------------------------------------------------------
|
||||
def test_category_alias_mapped_to_slug():
    """The alias ``teambuilding`` must come back as the ``team-building`` slug."""
    raw = _ext_activity(category="teambuilding")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "team-building"
|
||||
|
||||
|
||||
def test_unknown_category_falls_back_to_altele():
    """An unrecognized category must be stored as ``altele``."""
    raw = _ext_activity(category="zzz-not-a-category")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.category == "altele"
|
||||
|
||||
|
||||
def test_content_type_normalized():
    """The content type ``games`` must be normalized to ``joc``."""
    raw = _ext_activity(content_type="games")
    converted = bd.dict_to_activity(raw, "s.txt")
    assert converted.content_type == "joc"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 4 — dedup, three bands
|
||||
# --------------------------------------------------------------------------
|
||||
def test_dedup_auto_merge_identical_descriptions():
    """>= 85 similar -> a single merged row."""
    shared = "copiii formeaza echipe si traverseaza terenul"
    twins = [_activity(description=shared), _activity(description=shared)]
    merged, counters = bd.dedup_activities(twins)
    assert len(merged) == 1
    assert counters["auto_merged"] == 1
    assert merged[0].needs_review == 0
|
||||
|
||||
|
||||
def test_dedup_borderline_keeps_both_and_flags_needs_review():
    """60-85 similar -> both kept, both flagged needs_review."""
    from rapidfuzz import fuzz

    short_desc = "alpha beta gamma delta epsilon"
    long_desc = "alpha beta gamma delta epsilon zeta eta theta iota"
    # Guard: the pair must really score inside the borderline band.
    similarity = fuzz.token_sort_ratio(short_desc, long_desc)
    assert 60.0 <= similarity < 85.0, f"precondition: score={similarity} not borderline"

    candidates = [_activity(description=short_desc), _activity(description=long_desc)]
    kept, counters = bd.dedup_activities(candidates)
    assert len(kept) == 2
    assert counters["borderline"] == 2
    assert not any(item.needs_review != 1 for item in kept)
|
||||
|
||||
|
||||
def test_dedup_low_similarity_kept_as_separate_variants():
    """< 60 similar -> separate variants, no needs_review."""
    from rapidfuzz import fuzz

    first_desc = "alpha beta gamma delta epsilon"
    second_desc = "quebec romeo sierra tango uniform victor whiskey"
    # Guard: the pair must score below the borderline band.
    assert fuzz.token_sort_ratio(first_desc, second_desc) < 60.0

    candidates = [_activity(description=first_desc), _activity(description=second_desc)]
    kept, counters = bd.dedup_activities(candidates)
    assert len(kept) == 2
    assert counters["auto_merged"] == 0
    assert not any(item.needs_review != 0 for item in kept)
|
||||
|
||||
|
||||
def test_dedup_never_merges_across_languages():
    """Same name + same description but EN vs RO -> two distinct rows."""
    shared = {"name": "Cursa", "description": "children form teams and cross the field"}
    romanian = _activity(language="ro", **shared)
    english = _activity(language="en", **shared)
    kept, counters = bd.dedup_activities([romanian, english])
    assert len(kept) == 2
    assert counters["auto_merged"] == 0
    assert {item.language for item in kept} == {"ro", "en"}
|
||||
|
||||
|
||||
def test_merge_combines_fields():
    """On merge: longest description/rules, union materials, accumulated sources."""
    shared_desc = "copiii formeaza echipe si traverseaza terenul cu obstacole"
    long_rules = "o regula mult mai lunga si mai detaliata pentru joc"
    first = _activity(
        description=shared_desc,
        rules="regula scurta",
        materials_list="franghie, esarfa",
        source_file="a.txt",
        keywords="echipa",
    )
    second = _activity(
        description=shared_desc,
        rules=long_rules,
        materials_list="busola, esarfa",
        source_file="b.txt",
        keywords="cooperare",
    )

    result, _ = bd.dedup_activities([first, second])
    assert len(result) == 1
    merged = result[0]

    def _csv_set(value):
        # Comma-separated field -> set of trimmed entries.
        return {part.strip() for part in value.split(",")}

    assert merged.rules == long_rules
    assert _csv_set(merged.materials_list) == {"franghie", "esarfa", "busola"}
    assert set(merged.source_files) == {"a.txt", "b.txt"}
    assert merged.popularity_score == 1
    assert _csv_set(merged.keywords) == {"echipa", "cooperare"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 5 — review decisions
|
||||
# --------------------------------------------------------------------------
|
||||
def test_review_decision_drop_removes_row():
    """A ``drop`` decision keyed by content key must remove the activity."""
    from import_common import content_key, normalize_name

    candidate = _activity(description="o descriere de test")
    decision_key = content_key(
        normalize_name(candidate.name), candidate.language, candidate.description
    )
    decisions = {decision_key: {"decision": "drop"}}
    remaining, counters = bd.apply_review_decisions([candidate], decisions)
    assert remaining == []
    assert counters["dropped"] == 1
|
||||
|
||||
|
||||
def test_review_decision_keep_separate_clears_needs_review():
    """A ``keep-separate`` decision keeps the row and resets ``needs_review``."""
    from import_common import content_key, normalize_name

    candidate = _activity(description="o descriere de test")
    candidate.needs_review = 1
    decision_key = content_key(
        normalize_name(candidate.name), candidate.language, candidate.description
    )
    decisions = {decision_key: {"decision": "keep-separate"}}
    remaining, counters = bd.apply_review_decisions([candidate], decisions)
    assert len(remaining) == 1 and remaining[0].needs_review == 0
    assert counters["resolved"] == 1
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# step 2b — source_excerpt hallucination check
|
||||
# --------------------------------------------------------------------------
|
||||
def test_hallucinated_excerpt_activity_dropped(tmp_path):
    """An activity whose excerpt is absent from its chunk text must be dropped."""
    from import_common import load_schema

    extracted_dir, chunks_dir, sources_dir = (
        tmp_path / sub for sub in ("extracted", "chunks", "sources")
    )
    chunk_key = "src01.part01"

    grounded = _ext_activity(
        name="Joc real", source_excerpt="textul real apare in bucata sursa"
    )
    invented = _ext_activity(
        name="Joc inventat",
        source_excerpt="acest citat nu exista nicaieri in sursa originala xyzzy",
    )
    _write_extraction(extracted_dir, chunk_key, [grounded, invented])
    # Only the first excerpt occurs in the chunk text.
    _write_chunk(
        chunks_dir,
        "src01",
        chunk_key,
        "--- PAGE 1 ---\ntextul real apare in bucata sursa pentru jocul real.\n",
    )

    schema = load_schema()
    outcome = bd.collect_activities(extracted_dir, chunks_dir, sources_dir, schema)
    assert {item.name for item in outcome["activities"]} == {"Joc real"}
    assert outcome["activities_hallucinated"] == 1
    assert (extracted_dir / "_rejected").exists()
|
||||
|
||||
|
||||
def test_schema_invalid_file_moved_to_rejected(tmp_path):
    """A schema-invalid extraction is moved to ``_rejected`` with an errors file."""
    from import_common import load_schema

    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    sources_dir = tmp_path / "sources"
    extracted_dir.mkdir(parents=True)

    # missing required header keys + bad activity
    invalid_doc = {"header": {}, "activities": [{"name": "x"}]}
    offending = extracted_dir / "bad.json"
    offending.write_text(json.dumps(invalid_doc), encoding="utf-8")

    outcome = bd.collect_activities(extracted_dir, chunks_dir, sources_dir, load_schema())
    assert outcome["files_rejected_schema"] == 1
    assert not offending.exists()
    rejected_dir = extracted_dir / "_rejected"
    assert (rejected_dir / "bad.json").exists()
    assert (rejected_dir / "bad.errors.txt").exists()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# end-to-end rebuild + atomic swap
|
||||
# --------------------------------------------------------------------------
|
||||
def _setup_corpus(tmp_path):
    """Create a one-activity corpus; return ``(extracted, chunks, sources)`` dirs.

    The activity's excerpt is embedded in the chunk text. The sources directory
    is returned but not created here.
    """
    chunk_key = "src01.part01"
    excerpt = "jocul testului este o activitate de echipa"
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    sources_dir = tmp_path / "sources"

    _write_extraction(extracted_dir, chunk_key, [_ext_activity(source_excerpt=excerpt)])
    chunk_text = f"--- PAGE 1 ---\n{excerpt} in aer liber.\n"
    _write_chunk(chunks_dir, "src01", chunk_key, chunk_text)
    return extracted_dir, chunks_dir, sources_dir
|
||||
|
||||
|
||||
def test_rebuild_creates_database(tmp_path):
    """A rebuild over the one-activity corpus yields a searchable one-row DB."""
    extracted_dir, chunks_dir, sources_dir = _setup_corpus(tmp_path)
    target_db = tmp_path / "activities.db"

    report = bd.rebuild(
        extracted_dir=extracted_dir,
        chunks_dir=chunks_dir,
        sources_dir=sources_dir,
        db_path=target_db,
    )
    assert target_db.exists()
    assert report["final_count"] == 1

    found = DatabaseManager(str(target_db)).search_activities()
    assert len(found) == 1
    assert found[0]["category"] == "team-building"
|
||||
|
||||
|
||||
def test_atomic_swap_keeps_live_db_intact_on_crash(tmp_path, monkeypatch):
    """A mid-build crash must leave the live DB byte-identical."""
    extracted_dir, chunks_dir, sources_dir = _setup_corpus(tmp_path)
    live_db = tmp_path / "activities.db"

    # a pre-existing live DB with sentinel content
    manager = DatabaseManager(str(live_db))
    manager.insert_activity(_activity(name="Sentinel viu"))
    snapshot = live_db.read_bytes()

    def boom(self, *a, **k):
        raise RuntimeError("simulated mid-build crash")

    # Make the bulk insert fail so the rebuild aborts part-way through.
    monkeypatch.setattr(DatabaseManager, "bulk_insert_activities", boom)

    rebuild_kwargs = {
        "extracted_dir": extracted_dir,
        "chunks_dir": chunks_dir,
        "sources_dir": sources_dir,
        "db_path": live_db,
    }
    with pytest.raises(RuntimeError, match="simulated mid-build crash"):
        bd.rebuild(**rebuild_kwargs)

    # live DB untouched, tmp cleaned up
    assert live_db.read_bytes() == snapshot
    assert not (tmp_path / "activities.db.tmp").exists()
|
||||
|
||||
|
||||
def test_rebuild_backs_up_live_db(tmp_path):
    """Rebuilding over an existing DB must report an ``activities.db.bak`` backup."""
    extracted_dir, chunks_dir, sources_dir = _setup_corpus(tmp_path)
    live_db = tmp_path / "activities.db"
    existing = DatabaseManager(str(live_db))
    existing.insert_activity(_activity(name="Vechi"))

    report = bd.rebuild(
        extracted_dir=extracted_dir,
        chunks_dir=chunks_dir,
        sources_dir=sources_dir,
        db_path=live_db,
    )
    backup = report["backup"]
    assert backup is not None
    assert Path(backup).exists()
    assert os.path.basename(backup) == "activities.db.bak"
|
||||
183
tests/test_chunk_sources.py
Normal file
183
tests/test_chunk_sources.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Tests for scripts/chunk_sources.py."""
|
||||
|
||||
import json
|
||||
|
||||
import chunk_sources as cs
|
||||
import normalize_sources as ns
|
||||
|
||||
|
||||
def _pages(n):
|
||||
return [(i, f"text-{i}") for i in range(1, n + 1)]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# header parsing
|
||||
# --------------------------------------------------------------------------
|
||||
def test_parse_source_splits_header_and_body(paginated_source):
    """``parse_source`` exposes header fields and a body starting at page 1."""
    raw = paginated_source.read_text(encoding="utf-8")
    parsed = cs.parse_source(raw)
    header, body = parsed
    assert header["FORMAT"] == "pdf"
    assert body.lstrip().startswith("--- PAGE 1 ---")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# page chunking
|
||||
# --------------------------------------------------------------------------
|
||||
def test_chunk_pages_basic_split():
    """50 pages at 20 per chunk with overlap 4: chunks start at 1 and 17, end at 50."""
    pieces = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    # stride 16: starts at pages 1, 17, 33, ...
    head = pieces[0]
    assert (head["page_start"], head["page_end"]) == (1, 20)
    assert pieces[1]["page_start"] == 17
    assert pieces[-1]["page_end"] == 50
|
||||
|
||||
|
||||
def test_chunk_pages_have_overlap():
    """Consecutive chunks must share exactly the requested 4 pages."""
    first, second, *_ = cs.chunk_pages(_pages(50), pages_per_chunk=20, overlap=4)
    shared_pages = first["page_end"] - second["page_start"] + 1
    assert shared_pages == 4
|
||||
|
||||
|
||||
def test_chunk_pages_short_document_single_chunk():
    """A document shorter than one chunk yields a single chunk covering it all."""
    pieces = cs.chunk_pages(_pages(8), pages_per_chunk=20, overlap=4)
    assert len(pieces) == 1
    only = pieces[0]
    assert (only["page_start"], only["page_end"]) == (1, 8)
|
||||
|
||||
|
||||
def test_chunk_pages_empty():
    """No pages in -> no chunks out."""
    no_pages = []
    assert cs.chunk_pages(no_pages) == []
|
||||
|
||||
|
||||
def test_activity_at_page_boundary_intact_in_one_chunk(paginated_source):
    """An activity straddling the page 20/21 boundary must appear whole in >=1 chunk."""
    raw = paginated_source.read_text(encoding="utf-8")
    markers = ("ACTIVITY-START", "ACTIVITY-END")
    whole = [
        piece
        for piece in cs.make_chunks(raw)
        if all(marker in piece["text"] for marker in markers)
    ]
    assert whole, "activity spanning a page boundary was split across all chunks"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# word-window chunking for unpaginated text
|
||||
# --------------------------------------------------------------------------
|
||||
def test_chunk_words_window_and_overlap():
    """25k words in 10k windows with 2k overlap: 3 chunks sharing a 2k-word seam."""
    corpus = " ".join(f"w{idx}" for idx in range(25_000))
    pieces = cs.chunk_words(corpus, window=10_000, overlap=2_000)
    assert len(pieces) == 3  # stride 8000 over 25000 words
    head_words = pieces[0]["text"].split()
    next_words = pieces[1]["text"].split()
    # 2000-word overlap
    assert head_words[8_000:10_000] == next_words[:2_000]
|
||||
|
||||
|
||||
def test_make_chunks_unpaginated_uses_word_windows():
    """A body without page markers is chunked by words, not pages."""
    filler = "cuvant " * 15_000
    # header, separator line, blank line, then the unpaginated body
    document = "\n".join(["SOURCE: x", "FORMAT: txt", "=" * 50, "", filler])
    pieces = cs.make_chunks(document)
    assert len(pieces) >= 2
    assert pieces[0]["chunk_range"].startswith("words")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# stable source ids — anti-collision
|
||||
# --------------------------------------------------------------------------
|
||||
def test_stable_id_same_stem_different_path_no_collision():
    """Same file stem under different directories must map to different ids."""
    camp_id, school_id = (
        ns.stable_id(rel_path)
        for rel_path in ("camp/games/scout.pdf", "school/lessons/scout.pdf")
    )
    assert camp_id != school_id
    assert camp_id.endswith("_scout") and school_id.endswith("_scout")
|
||||
|
||||
|
||||
def test_stable_id_deterministic():
    """Two calls with the same path must return the same id."""
    rel_path = "a/b/c.pdf"
    first_call = ns.stable_id(rel_path)
    second_call = ns.stable_id(rel_path)
    assert first_call == second_call
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# manifest registry + idempotency
|
||||
# --------------------------------------------------------------------------
|
||||
def test_run_writes_chunks_and_manifest(paginated_source, tmp_path):
    """``cs.run`` must write chunk files plus one manifest entry per chunk.

    Each manifest entry is expected to be ``pending``, to name
    ``<chunk_key>.json`` as its expected output, and to reference a chunk file
    that exists relative to the parent of the chunks directory.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"

    summary = cs.run(sources_dir, chunks_dir)
    assert summary["sources"] == 1
    assert summary["chunks"] >= 2

    # Explicit encoding: do not let the platform locale decide how the
    # manifest JSON is decoded.
    manifest_text = (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    manifest = json.loads(manifest_text)
    assert manifest["chunks"]
    for key, meta in manifest["chunks"].items():
        assert meta["state"] == "pending"
        assert meta["expected_json"] == f"{key}.json"
        assert (chunks_dir.parent / meta["chunk_file"]).exists()
|
||||
|
||||
|
||||
def test_manifest_idempotent_preserves_state(paginated_source, tmp_path):
    """Re-running ``cs.run`` on unchanged sources must keep manifest state.

    A chunk marked ``done`` between two runs must still be ``done`` afterwards,
    and the number of manifest entries must not change.
    """
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    (sources_dir / paginated_source.name).write_text(
        paginated_source.read_text(encoding="utf-8"), encoding="utf-8"
    )
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)

    # orchestrator marks one chunk done
    # (read with the same explicit encoding used for the write below, so the
    # round trip does not depend on the platform locale)
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    n_before = len(manifest["chunks"])
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # re-run: 'done' must survive, no chunk added or lost
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert len(manifest2["chunks"]) == n_before
    assert manifest2["chunks"][first_key]["state"] == "done"
    assert all(
        m["state"] in ("pending", "done") for m in manifest2["chunks"].values()
    )
|
||||
|
||||
|
||||
def test_manifest_resets_state_when_source_changes(paginated_source, tmp_path):
    """Changing a source's content must reset its chunk state to ``pending``."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"
    manifest_path = chunks_dir / "manifest.json"

    cs.run(sources_dir, chunks_dir)
    # Manifest reads use an explicit encoding, matching the write below,
    # instead of the platform locale default.
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    first_key = next(iter(manifest["chunks"]))
    manifest["chunks"][first_key]["state"] = "done"
    manifest_path.write_text(json.dumps(manifest), encoding="utf-8")

    # mutate the source content -> hash changes -> state resets
    src.write_text(
        src.read_text(encoding="utf-8") + "\n--- PAGE 51 ---\nextra\n",
        encoding="utf-8",
    )
    cs.run(sources_dir, chunks_dir)
    manifest2 = json.loads(manifest_path.read_text(encoding="utf-8"))
    assert manifest2["chunks"][first_key]["state"] == "pending"
|
||||
|
||||
|
||||
def test_prune_stale_removes_orphan_entries(paginated_source, tmp_path):
    """Deleting a source must prune its chunk entries on the next run."""
    sources_dir = tmp_path / "sources"
    sources_dir.mkdir()
    src = sources_dir / paginated_source.name
    src.write_text(paginated_source.read_text(encoding="utf-8"), encoding="utf-8")
    chunks_dir = tmp_path / "chunks"

    cs.run(sources_dir, chunks_dir)
    # delete the source -> its chunks become stale
    src.unlink()
    summary = cs.run(sources_dir, chunks_dir)
    assert summary["chunks"] == 0
    assert summary["pruned"] >= 1
    # Explicit encoding keeps the manifest read independent of the platform
    # locale.
    manifest = json.loads(
        (chunks_dir / "manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["chunks"] == {}
|
||||
177
tests/test_extract_common.py
Normal file
177
tests/test_extract_common.py
Normal file
@@ -0,0 +1,177 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Tests for scripts/extract_common.py."""
|
||||
|
||||
import shutil
|
||||
import zipfile
|
||||
|
||||
import pytest
|
||||
|
||||
import extract_common as ec
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# format detection
|
||||
# --------------------------------------------------------------------------
|
||||
def test_detect_format():
    """File names map to format labels; unrecognized ones map to ``unknown``."""
    expectations = (
        ("a/b/file.PDF", "pdf"),
        ("x.docx", "docx"),
        ("x.doc", "doc"),
        ("x.pptx", "pptx"),
        ("x.html", "html"),
        ("x.zip", "zip"),
        ("x.epub", "epub"),
        ("x.xyz", "unknown"),
    )
    for file_name, label in expectations:
        assert ec.detect_format(file_name) == label
|
||||
|
||||
|
||||
def test_is_junk():
    """Known junk names are flagged; a real source PDF is not."""
    for junk_name in ("some/desktop.ini", "notes.bak", "README.md"):
        assert ec.is_junk(junk_name)
    assert not ec.is_junk("1000 Scout Games.pdf")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# PDF — the critical "no max_pages" regression
|
||||
# --------------------------------------------------------------------------
|
||||
def test_pdf_extracts_all_60_pages(big_pdf):
    """All 60 pages of the synthetic PDF must be present in the extracted text."""
    extracted = ec.extract_pdf(big_pdf)
    # the old converter capped at 50 pages — page 60 must be present now
    for expected in ("--- PAGE 60 ---", "PDFMARK-60"):
        assert expected in extracted
    assert ec.count_page_markers(extracted) == 60
|
||||
|
||||
|
||||
def test_pdf_does_not_truncate_mid_document(big_pdf):
    """The last page marker in the extracted text must be page 60."""
    split = ec.split_pages(ec.extract_pdf(big_pdf))
    last_page_no = split[-1][0]
    assert last_page_no == 60  # last marker is the real last page
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# page join / split round-trip
|
||||
# --------------------------------------------------------------------------
|
||||
def test_join_split_round_trip():
    """``split_pages`` must invert ``join_pages``, numbering pages from 1."""
    originals = ["alpha", "beta", "gamma"]
    recovered = ec.split_pages(ec.join_pages(originals))
    page_numbers = [number for number, _ in recovered]
    page_texts = [text for _, text in recovered]
    assert page_numbers == [1, 2, 3]
    assert page_texts == ["alpha", "beta", "gamma"]
|
||||
|
||||
|
||||
def test_split_pages_no_markers_returns_empty():
    """Text without page markers yields an empty page list."""
    unmarked = "plain text with no markers"
    assert ec.split_pages(unmarked) == []
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# docx — synthetic page markers
|
||||
# --------------------------------------------------------------------------
|
||||
def test_docx_synthetic_page_markers(sample_docx):
    """The 100-paragraph docx must come out as 3 synthetic pages, nothing lost."""
    extracted = ec.extract_docx(sample_docx)
    # 100 paragraphs / 40 per page => 3 pages
    marker_count = ec.count_page_markers(extracted)
    assert marker_count == 3
    assert "Paragraf 99" in extracted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# HTML mirror — nav/script/footer stripped
|
||||
# --------------------------------------------------------------------------
|
||||
def test_html_strips_chrome(html_with_nav):
    """Main content survives HTML extraction; script/header/footer/nav text does not."""
    extracted = ec.extract_html(html_with_nav)
    for kept in ("Vanatoarea de comori", "joc real de orientare"):
        assert kept in extracted
    # chrome must be gone
    chrome_tokens = ("tracking", "Site Banner Junk", "toate drepturile rezervate", "Games")
    for stripped in chrome_tokens:
        assert stripped not in extracted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# content hash + near-duplicate elimination
|
||||
# --------------------------------------------------------------------------
|
||||
def test_content_hash_ignores_whitespace():
    """A trailing newline must not change the hash; different text must."""
    reference = ec.content_hash("hello world")
    assert reference == ec.content_hash("hello world\n")
    assert ec.content_hash("hello world") != ec.content_hash("goodbye world")
|
||||
|
||||
|
||||
def test_dedupe_exact_duplicates():
    """Of two identical texts only the first key is kept; order is preserved."""
    entries = [("a", "joc identic"), ("b", "joc identic"), ("c", "alt joc")]
    surviving_keys = [key for key, _ in ec.dedupe_texts(entries)]
    assert surviving_keys == ["a", "c"]
|
||||
|
||||
|
||||
def test_dedupe_near_duplicates():
    """A near-identical 'print' variant is dropped; unrelated text is kept."""
    original = "Vanatoarea de comori este un joc de orientare pentru cercetasi in tabara."
    print_variant = original + " Pagina printata."  # >95% similar
    entries = [
        ("orig", original),
        ("print", print_variant),
        ("other", "Cu totul alt continut diferit aici."),
    ]
    surviving = [key for key, _ in ec.dedupe_texts(entries, threshold=85.0)]
    assert "orig" in surviving
    assert "print" not in surviving
    assert "other" in surviving
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# zip recursion
|
||||
# --------------------------------------------------------------------------
|
||||
def test_zip_recurses_into_inner_files(sample_zip):
    """Text of the docx inside the archive must appear, with page markers."""
    extracted = ec.extract_zip(sample_zip)
    assert "Paragraf 0" in extracted
    marker_count = ec.count_page_markers(extracted)
    assert marker_count > 0
|
||||
|
||||
|
||||
def test_zip_bad_archive_returns_empty(tmp_path):
    """A file that is not a valid zip must extract to an empty string."""
    broken = tmp_path / "broken.zip"
    broken.write_text("not a zip", encoding="utf-8")
    extracted = ec.extract_zip(broken)
    assert extracted == ""
|
||||
|
||||
|
||||
def test_nested_zip(tmp_path, sample_zip):
    """A zip stored inside another zip must still be unpacked and extracted."""
    wrapper = tmp_path / "outer.zip"
    with zipfile.ZipFile(wrapper, mode="w") as bundle:
        bundle.write(sample_zip, arcname="nested/archive.zip")
    extracted = ec.extract_zip(wrapper)
    assert "Paragraf 0" in extracted
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# preflight
|
||||
# --------------------------------------------------------------------------
|
||||
def test_preflight_python_packages_present():
    """Preflight must report no missing Python packages in the test environment."""
    # all required packages are installed in the test environment
    missing = ec.preflight()["missing_python"]
    assert missing == []
|
||||
|
||||
|
||||
def test_preflight_reports_libreoffice_state():
    """Warnings mention libreoffice exactly when no office binary is on PATH."""
    report = ec.preflight()
    office_binary = shutil.which("libreoffice") or shutil.which("soffice")
    mentions = ["libreoffice" in warning for warning in report["warnings"]]

    if office_binary:
        assert not any(mentions)
    else:
        assert any(mentions)
|
||||
|
||||
|
||||
def test_preflight_ocr_flag():
    """With check_ocr=True, an absent tesseract must be listed in missing_system."""
    report = ec.preflight(check_ocr=True)
    if shutil.which("tesseract"):
        # tesseract is available: this test asserts nothing further.
        return
    assert any("tesseract" in entry for entry in report["missing_system"])
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# legacy .doc — skipped unless libreoffice is installed
|
||||
# --------------------------------------------------------------------------
|
||||
@pytest.mark.skipif(
    not any(shutil.which(binary) for binary in ("libreoffice", "soffice")),
    reason="libreoffice not installed",
)
def test_doc_conversion(tmp_path, sample_docx):
    """Smoke test: extract_doc on a .doc-named copy of the sample docx yields a page marker."""
    legacy = tmp_path / "legacy.doc"
    # The sample .docx is simply copied under a .doc name.
    shutil.copy(sample_docx, legacy)

    text = ec.extract_doc(legacy)
    assert ec.count_page_markers(text) >= 1
|
||||
|
||||
|
||||
def test_doc_without_libreoffice_raises(tmp_path, monkeypatch):
    """extract_doc must raise RuntimeError when shutil.which finds no binary."""
    def _nothing_found(_name):
        return None

    # Make every executable lookup done through ec.shutil come back empty.
    monkeypatch.setattr(ec.shutil, "which", _nothing_found)

    target = tmp_path / "whatever.doc"
    with pytest.raises(RuntimeError):
        ec.extract_doc(target)
|
||||
139
tests/test_fts.py
Normal file
139
tests/test_fts.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""
|
||||
Integration tests for the FTS5 search index.
|
||||
|
||||
Confirms that materials_list and skills_developed are indexed by FTS5 and kept
|
||||
in sync by the insert / update / delete triggers (plan §6, §7).
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
# Make the project root importable when pytest is run from anywhere.
|
||||
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||
if PROJECT_ROOT not in sys.path:
|
||||
sys.path.insert(0, PROJECT_ROOT)
|
||||
|
||||
from app.models.activity import Activity # noqa: E402
|
||||
from app.models.database import DatabaseManager # noqa: E402
|
||||
|
||||
|
||||
@pytest.fixture
def db(tmp_path):
    """Provide a DatabaseManager created on a SQLite path inside tmp_path."""
    database_file = tmp_path / "test_activities.db"
    return DatabaseManager(str(database_file))
|
||||
|
||||
|
||||
def _make_activity(**overrides):
    """Build an Activity from default field values, with keyword overrides applied on top."""
    defaults = {
        "name": "Vânătoarea de comori",
        "description": "O activitate de echipă în aer liber.",
        "category": "camp-outdoor",
        "content_type": "joc",
        "source_file": "test.txt",
        "language": "ro",
    }
    return Activity(**{**defaults, **overrides})
|
||||
|
||||
|
||||
def test_search_by_materials_list(db):
    """Searching a word present only in materials_list must find the activity."""
    db.insert_activity(_make_activity(materials_list="frânghie, eșarfă, busolă"))

    hits = db.search_activities(search_text="busolă")

    assert len(hits) == 1
    assert hits[0]["name"] == "Vânătoarea de comori"
|
||||
|
||||
|
||||
def test_search_by_skills_developed(db):
    """Searching a word present only in skills_developed must find the activity."""
    db.insert_activity(
        _make_activity(skills_developed="comunicare, leadership, rabdare")
    )

    hits = db.search_activities(search_text="leadership")

    assert len(hits) == 1
    assert hits[0]["name"] == "Vânătoarea de comori"
|
||||
|
||||
|
||||
def test_term_absent_from_indexed_columns_no_hit(db):
    """Control case: a term that appears nowhere in the row must return no results."""
    db.insert_activity(_make_activity(materials_list="frânghie"))
    hits = db.search_activities(search_text="zzzunlikelyterm")
    assert hits == []
|
||||
|
||||
|
||||
def test_delete_trigger_removes_from_fts(db):
    """After a raw DELETE on activities, the row must no longer be searchable."""
    term = "catalige"
    row_id = db.insert_activity(_make_activity(materials_list=term))
    assert len(db.search_activities(search_text=term)) == 1

    # Delete with plain SQL rather than through a DatabaseManager helper.
    with db._get_connection() as conn:
        conn.execute("DELETE FROM activities WHERE id = ?", (row_id,))
        conn.commit()

    assert db.search_activities(search_text=term) == []
|
||||
|
||||
|
||||
def test_update_trigger_resyncs_fts(db):
    """After a raw UPDATE of materials_list, only the new value must be searchable."""
    old_term, new_term = "creioane", "acuarele"
    row_id = db.insert_activity(_make_activity(materials_list=old_term))
    assert len(db.search_activities(search_text=old_term)) == 1

    # Update with plain SQL rather than through a DatabaseManager helper.
    with db._get_connection() as conn:
        conn.execute(
            "UPDATE activities SET materials_list = ? WHERE id = ?",
            (new_term, row_id),
        )
        conn.commit()

    # The previous value must stop matching; the replacement must match.
    assert db.search_activities(search_text=old_term) == []
    assert len(db.search_activities(search_text=new_term)) == 1
|
||||
|
||||
|
||||
def test_rebuild_fts_index(db):
    """A skills_developed term must still be searchable after rebuild_fts_index()."""
    db.insert_activity(_make_activity(skills_developed="orientare"))
    db.rebuild_fts_index()
    hits = db.search_activities(search_text="orientare")
    assert len(hits) == 1
|
||||
|
||||
|
||||
def test_new_schema_columns_round_trip(db):
    """The new columns must survive insert, get_activity_by_id and Activity.from_dict."""
    stored_id = db.insert_activity(
        _make_activity(
            source_files=["a.txt", "b.txt"],
            source_excerpt="Citat scurt din sursă.",
            extraction_confidence="high",
            needs_review=1,
            normalized_name="vanatoarea de comori",
        )
    )

    row = db.get_activity_by_id(stored_id)
    expected_scalars = (
        ("content_type", "joc"),
        ("language", "ro"),
        ("extraction_confidence", "high"),
        ("needs_review", 1),
        ("normalized_name", "vanatoarea de comori"),
    )
    for column, expected in expected_scalars:
        assert row[column] == expected
    # The row holds source_files as JSON text, so decode it before comparing.
    assert json.loads(row["source_files"]) == ["a.txt", "b.txt"]
    assert row["source_excerpt"] == "Citat scurt din sursă."

    restored = Activity.from_dict(row)
    assert restored.source_files == ["a.txt", "b.txt"]
    assert restored.content_type == "joc"
|
||||
|
||||
|
||||
def test_normalized_name_auto_derived(db):
    """Without an explicit normalized_name, the expected value is the lowercase, diacritic-free name."""
    fields = {
        "name": "Ștafetă cu Obstacole",
        "description": "desc",
        "category": "sports-active",
        "source_file": "t.txt",
    }
    built = Activity(**fields)
    assert built.normalized_name == "stafeta cu obstacole"
|
||||
140
tests/test_search.py
Normal file
140
tests/test_search.py
Normal file
@@ -0,0 +1,140 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
CRITICAL REGRESSION TEST (plan §6, §7).
|
||||
|
||||
`search.py` changed the result sets of /search and /api/search: the default
|
||||
search now EXCLUDES the non-game content types (rețetă / cântec / ceremonie),
|
||||
which surface only when the user explicitly filters that content_type or picks
|
||||
a non-game category. This test guards that behaviour.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from app.models.activity import Activity
|
||||
from app.models.database import DatabaseManager
|
||||
from app.services.search import SearchService
|
||||
from app.config_taxonomy import NON_GAME_CONTENT_TYPES
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# fixtures
|
||||
# --------------------------------------------------------------------------
|
||||
def _activity(name, content_type, category="altele", language="ro"):
    """Build a fixture Activity whose description embeds the name and content type."""
    description = f"Descriere pentru {name}, un conținut de tip {content_type}."
    return Activity(
        name=name,
        description=description,
        category=category,
        content_type=content_type,
        language=language,
        source_file="test/fixture.txt",
    )
|
||||
|
||||
|
||||
@pytest.fixture
def search_service(tmp_path):
    """Provide a SearchService over a temp DB seeded with six fixture activities."""
    # Columns: name, content_type, category, language.
    seed_rows = (
        ("Vanatoarea de comori", "joc", "wide-games", "ro"),
        ("Cercul de cunoastere", "activitate", "icebreakers", "ro"),
        ("Reteta de paine la ceaun", "reteta", "retete", "ro"),
        ("Cantecul de tabara", "cantec", "cantece-ceremonii", "ro"),
        ("Ceremonia de inchidere", "ceremonie", "cantece-ceremonii", "ro"),
        ("Game in English", "joc", "wide-games", "en"),
    )

    database = DatabaseManager(str(tmp_path / "activities.db"))
    database.clear_database()
    database.bulk_insert_activities([
        _activity(title, kind, category=category, language=lang)
        for title, kind, category, lang in seed_rows
    ])
    return SearchService(database)
|
||||
|
||||
|
||||
def _content_types(results):
    """Return the set of ``content_type`` values found across the result mappings."""
    seen = set()
    for row in results:
        seen.add(row.get("content_type"))
    return seen
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# the regression: default search excludes non-game content types
|
||||
# --------------------------------------------------------------------------
|
||||
def test_default_search_excludes_non_game_content(search_service):
    """An unfiltered search must return game content only, never the non-game types."""
    found = _content_types(search_service.search_activities())

    assert found, "default search returned nothing"
    for excluded in NON_GAME_CONTENT_TYPES:
        assert excluded not in found, (
            f"default search leaked non-game content_type '{excluded}'"
        )
    # Both game-like content types must still be returned.
    assert "joc" in found
    assert "activitate" in found
|
||||
|
||||
|
||||
def test_default_search_with_text_excludes_non_game(search_service):
    """A text query must still exclude every non-game content type by default.

    Checks each entry of NON_GAME_CONTENT_TYPES rather than only the first,
    so a leak of any non-game type is caught. Iterating also avoids assuming
    the constant is an indexable sequence.
    """
    results = search_service.search_activities(search_text="conținut")
    types = _content_types(results)

    # NOTE(review): an empty result set would pass vacuously here; confirm
    # that the fixture descriptions are expected to match this query.
    for non_game in NON_GAME_CONTENT_TYPES:
        assert non_game not in types, (
            f"text search leaked non-game content_type '{non_game}'"
        )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# explicit content_type filter INCLUDES the non-game rows
|
||||
# --------------------------------------------------------------------------
|
||||
def test_explicit_content_type_filter_includes_non_game(search_service):
    """An explicit content_type=reteta filter must return the single seeded recipe."""
    hits = search_service.search_activities(filters={"content_type": "reteta"})
    found = _content_types(hits)

    assert found == {"reteta"}, f"expected only rețete, got {found}"
    assert len(hits) == 1
|
||||
|
||||
|
||||
def test_explicit_content_type_filter_for_cantec(search_service):
    """An explicit content_type=cantec filter must return only that content type."""
    hits = search_service.search_activities(filters={"content_type": "cantec"})
    found = _content_types(hits)
    assert found == {"cantec"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# a non-game CATEGORY filter also lifts the exclusion
|
||||
# --------------------------------------------------------------------------
|
||||
def test_non_game_category_filter_includes_non_game(search_service):
    """Filtering by category=cantece-ceremonii must return both songs and ceremonies."""
    category_filter = {"category": "cantece-ceremonii"}
    found = _content_types(
        search_service.search_activities(filters=category_filter)
    )

    assert "cantec" in found
    assert "ceremonie" in found
|
||||
|
||||
|
||||
def test_game_category_filter_still_excludes_non_game(search_service):
    """Filtering by the wide-games category must not return any non-game type."""
    hits = search_service.search_activities(filters={"category": "wide-games"})
    found = _content_types(hits)
    for excluded in NON_GAME_CONTENT_TYPES:
        assert excluded not in found
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# language filter
|
||||
# --------------------------------------------------------------------------
|
||||
def test_language_filter_ro(search_service):
    """language=ro must return a non-empty result set of Romanian rows only."""
    hits = search_service.search_activities(filters={"language": "ro"})
    assert hits
    for row in hits:
        assert row.get("language") == "ro"
|
||||
|
||||
|
||||
def test_language_filter_en(search_service):
    """language=en must return only the English fixture row."""
    hits = search_service.search_activities(filters={"language": "en"})
    assert hits
    for row in hits:
        assert row.get("language") == "en"
    names = {row.get("name") for row in hits}
    assert names == {"Game in English"}
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# get_filter_options surfaces the new axes
|
||||
# --------------------------------------------------------------------------
|
||||
def test_filter_options_include_content_type_and_language(search_service):
    """get_filter_options() must expose content_type and language entries."""
    options = search_service.db.get_filter_options()

    for axis in ("content_type", "language"):
        assert axis in options
    assert "joc" in options["content_type"]
    assert set(options["language"]) == {"ro", "en"}
|
||||
156
tests/test_validate_extractions.py
Normal file
156
tests/test_validate_extractions.py
Normal file
@@ -0,0 +1,156 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Tests for scripts/validate_extractions.py.
|
||||
|
||||
Covers: schema rejection, the source_excerpt hallucination check, the content
|
||||
of the generated re-extraction prompt, and the manifest `rejected` marking.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
SCRIPTS_DIR = REPO_ROOT / "scripts"
|
||||
for _p in (str(REPO_ROOT), str(SCRIPTS_DIR)):
|
||||
if _p not in sys.path:
|
||||
sys.path.insert(0, _p)
|
||||
|
||||
import validate_extractions as ve # noqa: E402
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def _ext_activity(**over):
    """Return an extracted-activity dict of default fields, with keyword overrides applied."""
    defaults = {
        "name": "Jocul testului",
        "description": "O activitate de echipa in aer liber.",
        "category": "team-building",
        "content_type": "joc",
        "language": "ro",
        "extraction_confidence": "high",
        "source_excerpt": "ancora din bucata sursa",
        "page_reference": "page 1",
    }
    return {**defaults, **over}
|
||||
|
||||
|
||||
def _write_extraction(extracted_dir, chunk_key, activities, header_extra=None):
    """Write ``<chunk_key>.json`` (a header plus activities) into extracted_dir.

    The directory is created if missing. Entries in header_extra, when given,
    override or extend the default header fields.
    """
    extracted_dir.mkdir(parents=True, exist_ok=True)

    header = dict(
        source_hash="hash1234deadbeef",
        schema_version="1.0",
        prompt_version="1.0",
        chunk_range="pages 1-20",
        source_id="src01",
        chunk_key=chunk_key,
    )
    header.update(header_extra or {})

    target = extracted_dir / f"{chunk_key}.json"
    serialized = json.dumps(
        {"header": header, "activities": activities}, ensure_ascii=False
    )
    target.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def _write_chunk(chunks_dir, source_id, chunk_key, text):
    """Write text to ``<chunks_dir>/<source_id>/<chunk_key>.txt``, creating directories."""
    source_dir = chunks_dir.joinpath(source_id)
    source_dir.mkdir(parents=True, exist_ok=True)
    chunk_file = source_dir.joinpath(f"{chunk_key}.txt")
    chunk_file.write_text(text, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# tests
|
||||
# --------------------------------------------------------------------------
|
||||
def test_valid_file_passes(tmp_path):
    """An extraction whose excerpt occurs in its chunk text counts as valid."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    quote = "ancora din bucata sursa apare aici"

    _write_extraction(
        extracted_dir, "src01.part01", [_ext_activity(source_excerpt=quote)]
    )
    _write_chunk(chunks_dir, "src01", "src01.part01", f"--- PAGE 1 ---\n{quote}\n")

    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert report["valid"] == 1
    assert report["rejected"] == 0
|
||||
|
||||
|
||||
def test_schema_invalid_file_rejected(tmp_path):
    """An empty header and a name-only activity must be rejected, with a prompt file written."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    extracted_dir.mkdir(parents=True)

    bad_payload = json.dumps({"header": {}, "activities": [{"name": "x"}]})
    (extracted_dir / "src01.part01.json").write_text(bad_payload, encoding="utf-8")

    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert report["rejected"] == 1
    reextract_prompt = extracted_dir / "_reextract" / "src01.part01.prompt.md"
    assert reextract_prompt.exists()
|
||||
|
||||
|
||||
def test_hallucinated_excerpt_rejected(tmp_path):
    """An excerpt absent from the chunk text must be rejected with a hallucination error."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"

    invented = _ext_activity(
        source_excerpt="citat complet inventat care nu exista qqqq"
    )
    _write_extraction(extracted_dir, "src01.part01", [invented])
    _write_chunk(
        chunks_dir, "src01", "src01.part01",
        "--- PAGE 1 ---\ntext complet diferit despre altceva.\n",
    )

    report = ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    assert report["rejected"] == 1
    first_errors = report["rejected_chunks"][0]["errors"]
    assert any("hallucination" in message for message in first_errors)
|
||||
|
||||
|
||||
def test_reextraction_prompt_content(tmp_path):
    """The generated re-extraction prompt must contain four expected fragments."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"

    invented = _ext_activity(source_excerpt="citat inventat care nu exista zzzz")
    _write_extraction(extracted_dir, "src01.part01", [invented])
    _write_chunk(
        chunks_dir, "src01", "src01.part01",
        "--- PAGE 1 ---\ntext despre cu totul altceva aici.\n",
    )

    ve.run(extracted_dir, chunks_dir, tmp_path / "manifest.json")

    prompt_file = extracted_dir / "_reextract" / "src01.part01.prompt.md"
    prompt = prompt_file.read_text(encoding="utf-8")
    expected_fragments = (
        "src01.part01",
        "REJECTED",
        "verbatim",
        "data/extracted/src01.part01.json",
    )
    for fragment in expected_fragments:
        assert fragment in prompt
|
||||
|
||||
|
||||
def test_manifest_marks_chunk_rejected(tmp_path):
    """A chunk recorded as "done" must be rewritten to "rejected" in the manifest."""
    extracted_dir = tmp_path / "extracted"
    chunks_dir = tmp_path / "chunks"
    manifest_file = tmp_path / "manifest.json"

    initial_manifest = {
        "chunks": {
            "src01.part01": {
                "state": "done",
                "chunk_file": "chunks/src01/src01.part01.txt",
            }
        }
    }
    manifest_file.write_text(json.dumps(initial_manifest), encoding="utf-8")

    fabricated = _ext_activity(source_excerpt="citat fabricat absent vvvv")
    _write_extraction(extracted_dir, "src01.part01", [fabricated])
    _write_chunk(
        chunks_dir, "src01", "src01.part01",
        "--- PAGE 1 ---\nun continut neinrudit.\n",
    )

    ve.run(extracted_dir, chunks_dir, manifest_file)

    updated = json.loads(manifest_file.read_text(encoding="utf-8"))
    assert updated["chunks"]["src01.part01"]["state"] == "rejected"
|
||||
|
||||
|
||||
def test_build_reextraction_prompt_lists_errors():
    """The built prompt must mention the chunk key and the text of the supplied error."""
    chunk_key = "abc.part03"
    chunk_path = "data/chunks/abc/abc.part03.txt"
    errors = ["header: 'source_hash' is a required property"]

    prompt = ve.build_reextraction_prompt(chunk_key, chunk_path, errors)

    assert "abc.part03" in prompt
    assert "source_hash" in prompt
|
||||
Reference in New Issue
Block a user