fix(import): NFKD normalization for non-Romanian diacritics
Some checks failed
Tests / fast-tests (push) Has been cancelled
Tests / full-tests (push) Has been cancelled

clean_web_text used a hard-coded Romanian-only translation map, so Hungarian
(BALÁZS LORÁNT), German, Czech, Polish names passed through unchanged into
SQLite and Oracle ROA. Replace with unicodedata.normalize('NFKD') + combining
mark strip — covers RO/HU/DE/CZ/PL/FR/ES universally. Romanian cedilla legacy
forms (ş/ţ/Ş/Ţ) remain handled (NFKD decomposes to base + combining cedilla).
Stroke/ligature letters not decomposed by NFKD (ß, ł, đ, ø, æ, œ) are covered
via the _NFKD_OVERRIDES translation map.

sync_service._addr_match.norm migrated off the removed _DIACRITICS constant
to clean_web_text; address matching now also handles non-RO diacritics.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Claude Agent
2026-05-05 22:52:50 +00:00
parent 9b62b2b457
commit 956667086d
3 changed files with 64 additions and 20 deletions

View File

@@ -2,38 +2,39 @@ import html
import json import json
import logging import logging
import re import re
import unicodedata
import oracledb import oracledb
from datetime import datetime, timedelta from datetime import datetime, timedelta
from .. import database from .. import database
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Diacritics to ASCII mapping (Romanian) # Stroke/ligature letters NFKD does not decompose (structural mod, not a
_DIACRITICS = str.maketrans({ # combining mark). Everything else — RO cedilla ş/ţ, RO comma-below ș/ț,
'\u0103': 'a', # ă # HU ő/ű, DE umlaut, CZ háček, FR accent, ES tilde — is handled
'\u00e2': 'a', # â # universally by unicodedata.normalize('NFKD') + Mn-category strip below.
'\u00ee': 'i', # î _NFKD_OVERRIDES = str.maketrans({
'\u0219': 's', # ș 'ß': 'ss', # ß
'\u021b': 't', # ț 'æ': 'ae', 'Æ': 'AE', # æ Æ
'\u0102': 'A', # Ă 'œ': 'oe', 'Œ': 'OE', # œ Œ
'\u00c2': 'A', # Â 'ł': 'l', 'Ł': 'L', # ł Ł (Polish)
'\u00ce': 'I', # Î 'đ': 'd', 'Đ': 'D', # đ Đ (Croatian)
'\u0218': 'S', # Ș 'ø': 'o', 'Ø': 'O', # ø Ø (Danish/Norwegian)
'\u021a': 'T', # Ț
# Older Unicode variants
'\u015f': 's', # ş (cedilla)
'\u0163': 't', # ţ (cedilla)
'\u015e': 'S', # Ş
'\u0162': 'T', # Ţ
}) })
def clean_web_text(text: str) -> str: def clean_web_text(text: str) -> str:
"""Port of VFP CleanWebText: unescape HTML entities + diacritics to ASCII.""" """Port of VFP CleanWebText: unescape HTML entities + strip diacritics to ASCII.
NFKD decomposition + combining-mark filter covers RO/HU/DE/CZ/PL/FR/ES in
one pass; _NFKD_OVERRIDES handles stroke letters NFKD leaves alone.
"""
if not text: if not text:
return "" return ""
result = html.unescape(text) result = html.unescape(text)
result = result.translate(_DIACRITICS) result = result.translate(_NFKD_OVERRIDES)
decomposed = unicodedata.normalize('NFKD', result)
result = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
# Remove any remaining <br> tags # Remove any remaining <br> tags
for br in ('<br>', '<br/>', '<br />'): for br in ('<br>', '<br/>', '<br />'):
result = result.replace(br, ' ') result = result.replace(br, ' ')

View File

@@ -48,7 +48,7 @@ def _addr_match(gomag_json, roa_json):
r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))' r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))'
) )
def norm(s):
    # Canonical address key: ASCII-fold via clean_web_text (strips
    # diacritics), uppercase, drop address keywords, keep alphanumerics.
    folded = import_service.clean_web_text(s or '').upper()
    without_keywords = _ADDR_WORDS.sub('', folded)
    return re.sub(r'[^A-Z0-9]', '', without_keywords)
def _soundex(s): def _soundex(s):

View File

@@ -821,6 +821,49 @@ class TestFormatAddressForOracle:
assert result == "JUD:Bacau;Zemes;Str Principala Modarzau Blocuri" assert result == "JUD:Bacau;Zemes;Str Principala Modarzau Blocuri"
class TestCleanWebTextDiacritics:
    """NFKD-based clean_web_text folds RO/HU/DE/CZ/PL diacritics to ASCII."""

    def test_hungarian_acute(self):
        from app.services.import_service import clean_web_text as cwt
        assert cwt("BALÁZS LORÁNT") == "BALAZS LORANT"

    def test_hungarian_double_acute(self):
        from app.services.import_service import clean_web_text as cwt
        assert cwt("Lőrincz Ödön") == "Lorincz Odon"
        assert cwt("Erdős Pál") == "Erdos Pal"

    def test_romanian_comma_below_modern(self):
        from app.services.import_service import clean_web_text as cwt
        assert cwt("Ștefan Țîrcă") == "Stefan Tirca"
        assert cwt("ȘTEFAN ȚÎRCĂ") == "STEFAN TIRCA"

    def test_romanian_cedilla_legacy_preserved(self):
        """Legacy cedilla ş/ţ/Ş/Ţ must keep normalizing to s/t/S/T (regression guard)."""
        from app.services.import_service import clean_web_text as cwt
        assert cwt("şcoala") == "scoala"
        assert cwt("ţara") == "tara"
        assert cwt("ŞTEFAN ŢARA") == "STEFAN TARA"
        assert cwt("IAŞI") == "IASI"

    def test_german_umlaut_and_eszett(self):
        from app.services.import_service import clean_web_text as cwt
        assert cwt("Müller Straße") == "Muller Strasse"

    def test_czech_polish(self):
        from app.services.import_service import clean_web_text as cwt
        assert cwt("Dvořák") == "Dvorak"
        assert cwt("Łódź") == "Lodz"

    def test_html_entity_unescape(self):
        from app.services.import_service import clean_web_text as cwt
        assert cwt("Caf&eacute;") == "Cafe"

    def test_empty_input(self):
        from app.services.import_service import clean_web_text as cwt
        assert cwt("") == ""
# =========================================================================== # ===========================================================================
# Group 11: TestRefreshOrderAddress # Group 11: TestRefreshOrderAddress
# =========================================================================== # ===========================================================================