diff --git a/api/app/services/import_service.py b/api/app/services/import_service.py
index 6cff22a..7e57522 100644
--- a/api/app/services/import_service.py
+++ b/api/app/services/import_service.py
@@ -2,38 +2,39 @@ import html
 import json
 import logging
 import re
+import unicodedata
 import oracledb
 from datetime import datetime, timedelta
 from .. import database
 
 logger = logging.getLogger(__name__)
 
-# Diacritics to ASCII mapping (Romanian)
-_DIACRITICS = str.maketrans({
-    '\u0103': 'a',  # ă
-    '\u00e2': 'a',  # â
-    '\u00ee': 'i',  # î
-    '\u0219': 's',  # ș
-    '\u021b': 't',  # ț
-    '\u0102': 'A',  # Ă
-    '\u00c2': 'A',  # Â
-    '\u00ce': 'I',  # Î
-    '\u0218': 'S',  # Ș
-    '\u021a': 'T',  # Ț
-    # Older Unicode variants
-    '\u015f': 's',  # ş (cedilla)
-    '\u0163': 't',  # ţ (cedilla)
-    '\u015e': 'S',  # Ş
-    '\u0162': 'T',  # Ţ
+# Stroke/ligature letters NFKD does not decompose (structural mod, not a
+# combining mark). Everything else — RO cedilla ş/ţ, RO comma-below ș/ț,
+# HU ő/ű, DE umlaut, CZ háček, FR accent, ES tilde — is handled
+# universally by unicodedata.normalize('NFKD') + Mn-category strip below.
+_NFKD_OVERRIDES = str.maketrans({
+    'ß': 'ss',  # ß
+    'æ': 'ae', 'Æ': 'AE',  # æ Æ
+    'œ': 'oe', 'Œ': 'OE',  # œ Œ
+    'ł': 'l', 'Ł': 'L',  # ł Ł (Polish)
+    'đ': 'd', 'Đ': 'D',  # đ Đ (Croatian)
+    'ø': 'o', 'Ø': 'O',  # ø Ø (Danish/Norwegian)
 })
 
 
 def clean_web_text(text: str) -> str:
-    """Port of VFP CleanWebText: unescape HTML entities + diacritics to ASCII."""
+    """Port of VFP CleanWebText: unescape HTML entities + strip diacritics to ASCII.
+
+    NFKD decomposition + combining-mark filter covers RO/HU/DE/CZ/PL/FR/ES in
+    one pass; _NFKD_OVERRIDES handles stroke letters NFKD leaves alone.
+    """
     if not text:
         return ""
     result = html.unescape(text)
-    result = result.translate(_DIACRITICS)
+    result = result.translate(_NFKD_OVERRIDES)
+    decomposed = unicodedata.normalize('NFKD', result)
+    result = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
     # Remove any remaining <br> tags
     for br in ('<br>', '<br/>', '<br />'):
         result = result.replace(br, ' ')
diff --git a/api/app/services/sync_service.py b/api/app/services/sync_service.py
index a5498ab..d419b0a 100644
--- a/api/app/services/sync_service.py
+++ b/api/app/services/sync_service.py
@@ -48,7 +48,7 @@ def _addr_match(gomag_json, roa_json):
         r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))'
     )
     def norm(s):
-        s = (s or '').translate(import_service._DIACRITICS).upper()
+        s = import_service.clean_web_text(s or '').upper()
         s = _ADDR_WORDS.sub('', s)
         return re.sub(r'[^A-Z0-9]', '', s)
     def _soundex(s):
diff --git a/api/tests/test_business_rules.py b/api/tests/test_business_rules.py
index 5cf35b0..e1f9b76 100644
--- a/api/tests/test_business_rules.py
+++ b/api/tests/test_business_rules.py
@@ -821,6 +821,49 @@ class TestFormatAddressForOracle:
         assert result == "JUD:Bacau;Zemes;Str Principala Modarzau Blocuri"
 
 
+class TestCleanWebTextDiacritics:
+    """clean_web_text strips diacritics across RO/HU/DE/CZ/PL via NFKD."""
+
+    def test_hungarian_acute(self):
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("BALÁZS LORÁNT") == "BALAZS LORANT"
+
+    def test_hungarian_double_acute(self):
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("Lőrincz Ödön") == "Lorincz Odon"
+        assert clean_web_text("Erdős Pál") == "Erdos Pal"
+
+    def test_romanian_comma_below_modern(self):
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("Ștefan Țîrcă") == "Stefan Tirca"
+        assert clean_web_text("ȘTEFAN ȚÎRCĂ") == "STEFAN TIRCA"
+
+    def test_romanian_cedilla_legacy_preserved(self):
+        """Cedilla ş/ţ/Ş/Ţ must still normalize to s/t/S/T (regression guard)."""
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("şcoala") == "scoala"
+        assert clean_web_text("ţara") == "tara"
+        assert clean_web_text("ŞTEFAN ŢARA") == "STEFAN TARA"
+        assert clean_web_text("IAŞI") == "IASI"
+
+    def test_german_umlaut_and_eszett(self):
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("Müller Straße") == "Muller Strasse"
+
+    def test_czech_polish(self):
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("Dvořák") == "Dvorak"
+        assert clean_web_text("Łódź") == "Lodz"
+
+    def test_html_entity_unescape(self):
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("Caf&eacute;") == "Cafe"
+
+    def test_empty_input(self):
+        from app.services.import_service import clean_web_text
+        assert clean_web_text("") == ""
+
+
 # ===========================================================================
 # Group 11: TestRefreshOrderAddress
 # ===========================================================================