fix(import): NFKD normalization for non-Romanian diacritics
Some checks failed
Tests / fast-tests (push) Has been cancelled
Tests / full-tests (push) Has been cancelled

clean_web_text used a hard-coded Romanian-only translation map, so Hungarian
(BALÁZS LORÁNT), German, Czech, Polish names passed through unchanged into
SQLite and Oracle ROA. Replace with unicodedata.normalize('NFKD') + combining
mark strip — covers RO/HU/DE/CZ/PL/FR/ES universally. Romanian cedilla legacy
forms (ş/ţ/Ş/Ţ) remain handled (NFKD decomposes to base + combining cedilla).
Letters NFKD does not decompose at all (stroke and ligature forms: ß, ł, đ, ø, æ, œ)
are covered via the _NFKD_OVERRIDES translation map.

sync_service._addr_match.norm migrated off the removed _DIACRITICS constant
to clean_web_text; address matching now also handles non-RO diacritics.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-05-05 22:52:50 +00:00
parent 9b62b2b457
commit 956667086d
3 changed files with 64 additions and 20 deletions

View File

@@ -2,38 +2,39 @@ import html
import json
import logging
import re
import unicodedata
import oracledb
from datetime import datetime, timedelta
from .. import database
logger = logging.getLogger(__name__)
# Diacritics to ASCII mapping (Romanian)
# Stroke/ligature letters NFKD does not decompose (the modification is
# structural, not a combining mark). Everything else — RO cedilla ş/ţ,
# RO comma-below ș/ț, HU ő/ű, DE umlaut, CZ háček, FR accent, ES tilde —
# is handled universally by unicodedata.normalize('NFKD') plus the
# combining-mark strip in clean_web_text below.
_NFKD_OVERRIDES = str.maketrans({
    'ß': 'ss', 'ẞ': 'SS',   # eszett U+00DF / U+1E9E; NFKD leaves both alone
    'æ': 'ae', 'Æ': 'AE',   # æ Æ
    'œ': 'oe', 'Œ': 'OE',   # œ Œ
    'ł': 'l', 'Ł': 'L',     # ł Ł (Polish)
    'đ': 'd', 'Đ': 'D',     # đ Đ (Croatian)
    'ø': 'o', 'Ø': 'O',     # ø Ø (Danish/Norwegian)
})


def clean_web_text(text: str) -> str:
    """Port of VFP CleanWebText: unescape HTML entities + strip diacritics to ASCII.

    NFKD decomposition + combining-mark filter covers RO/HU/DE/CZ/PL/FR/ES in
    one pass; _NFKD_OVERRIDES handles stroke/ligature letters NFKD leaves
    alone. Legacy Romanian cedilla forms (ş/ţ/Ş/Ţ) decompose to base letter +
    combining cedilla, so they normalize to s/t/S/T as before.

    Args:
        text: Raw web text; may be empty/None, may contain HTML entities
            and ``<br>`` tags.

    Returns:
        ASCII-folded text with entities unescaped and ``<br>`` variants
        replaced by spaces; ``""`` for falsy input.
    """
    if not text:
        return ""
    result = html.unescape(text)
    # Map the non-decomposable letters first, then run the generic pass.
    result = result.translate(_NFKD_OVERRIDES)
    decomposed = unicodedata.normalize('NFKD', result)
    # unicodedata.combining() is non-zero exactly for combining marks.
    result = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Remove any remaining <br> tags
    for br in ('<br>', '<br/>', '<br />'):
        result = result.replace(br, ' ')
    return result

View File

@@ -48,7 +48,7 @@ def _addr_match(gomag_json, roa_json):
r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))'
)
def norm(s):
    # clean_web_text unescapes HTML entities and folds diacritics (NFKD),
    # so address matching handles non-Romanian names too. The old
    # import_service._DIACRITICS constant was removed; referencing it
    # would raise AttributeError.
    s = import_service.clean_web_text(s or '').upper()
    # Strip address keywords (STR/BL/AP/JUD/...), then keep alphanumerics only.
    s = _ADDR_WORDS.sub('', s)
    return re.sub(r'[^A-Z0-9]', '', s)
def _soundex(s):

View File

@@ -821,6 +821,49 @@ class TestFormatAddressForOracle:
assert result == "JUD:Bacau;Zemes;Str Principala Modarzau Blocuri"
class TestCleanWebTextDiacritics:
    """clean_web_text strips diacritics across RO/HU/DE/CZ/PL via NFKD."""

    def test_hungarian_acute(self):
        from app.services.import_service import clean_web_text
        assert clean_web_text("BALÁZS LORÁNT") == "BALAZS LORANT"

    def test_hungarian_double_acute(self):
        from app.services.import_service import clean_web_text
        cases = [("Lőrincz Ödön", "Lorincz Odon"), ("Erdős Pál", "Erdos Pal")]
        for raw, folded in cases:
            assert clean_web_text(raw) == folded

    def test_romanian_comma_below_modern(self):
        from app.services.import_service import clean_web_text
        cases = [("Ștefan Țîrcă", "Stefan Tirca"), ("ȘTEFAN ȚÎRCĂ", "STEFAN TIRCA")]
        for raw, folded in cases:
            assert clean_web_text(raw) == folded

    def test_romanian_cedilla_legacy_preserved(self):
        """Cedilla ş/ţ/Ş/Ţ must still normalize to s/t/S/T (regression guard)."""
        from app.services.import_service import clean_web_text
        cases = [
            ("şcoala", "scoala"),
            ("ţara", "tara"),
            ("ŞTEFAN ŢARA", "STEFAN TARA"),
            ("IAŞI", "IASI"),
        ]
        for raw, folded in cases:
            assert clean_web_text(raw) == folded

    def test_german_umlaut_and_eszett(self):
        from app.services.import_service import clean_web_text
        assert clean_web_text("Müller Straße") == "Muller Strasse"

    def test_czech_polish(self):
        from app.services.import_service import clean_web_text
        cases = [("Dvořák", "Dvorak"), ("Łódź", "Lodz")]
        for raw, folded in cases:
            assert clean_web_text(raw) == folded

    def test_html_entity_unescape(self):
        from app.services.import_service import clean_web_text
        assert clean_web_text("Caf&eacute;") == "Cafe"

    def test_empty_input(self):
        from app.services.import_service import clean_web_text
        assert clean_web_text("") == ""
# ===========================================================================
# Group 11: TestRefreshOrderAddress
# ===========================================================================