fix(import): NFKD normalization for non-Romanian diacritics
clean_web_text used a hard-coded Romanian-only translation map, so Hungarian
(BALÁZS LORÁNT), German, Czech, Polish names passed through unchanged into
SQLite and Oracle ROA. Replace with unicodedata.normalize('NFKD') + combining
mark strip — covers RO/HU/DE/CZ/PL/FR/ES universally. Romanian cedilla legacy
forms (ş/ţ/Ş/Ţ) remain handled (NFKD decomposes to base + combining cedilla).
Stroke/ligature letters with no NFKD decomposition (ß, ł, đ, ø, æ, œ) covered via
_NFKD_OVERRIDES translation map.
sync_service._addr_match.norm migrated off the removed _DIACRITICS constant
to clean_web_text; address matching now also handles non-RO diacritics.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,38 +2,39 @@ import html
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
import unicodedata
|
||||||
import oracledb
|
import oracledb
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from .. import database
|
from .. import database
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Diacritics to ASCII mapping (Romanian)
|
# Stroke/ligature letters NFKD does not decompose (structural mod, not a
|
||||||
_DIACRITICS = str.maketrans({
|
# combining mark). Everything else — RO cedilla ş/ţ, RO comma-below ș/ț,
|
||||||
'\u0103': 'a', # ă
|
# HU ő/ű, DE umlaut, CZ háček, FR accent, ES tilde — is handled
|
||||||
'\u00e2': 'a', # â
|
# universally by unicodedata.normalize('NFKD') + Mn-category strip below.
|
||||||
'\u00ee': 'i', # î
|
_NFKD_OVERRIDES = str.maketrans({
|
||||||
'\u0219': 's', # ș
|
'ß': 'ss', # ß
|
||||||
'\u021b': 't', # ț
|
'æ': 'ae', 'Æ': 'AE', # æ Æ
|
||||||
'\u0102': 'A', # Ă
|
'œ': 'oe', 'Œ': 'OE', # œ Œ
|
||||||
'\u00c2': 'A', # Â
|
'ł': 'l', 'Ł': 'L', # ł Ł (Polish)
|
||||||
'\u00ce': 'I', # Î
|
'đ': 'd', 'Đ': 'D', # đ Đ (Croatian)
|
||||||
'\u0218': 'S', # Ș
|
'ø': 'o', 'Ø': 'O', # ø Ø (Danish/Norwegian)
|
||||||
'\u021a': 'T', # Ț
|
|
||||||
# Older Unicode variants
|
|
||||||
'\u015f': 's', # ş (cedilla)
|
|
||||||
'\u0163': 't', # ţ (cedilla)
|
|
||||||
'\u015e': 'S', # Ş
|
|
||||||
'\u0162': 'T', # Ţ
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
def clean_web_text(text: str) -> str:
|
def clean_web_text(text: str) -> str:
|
||||||
"""Port of VFP CleanWebText: unescape HTML entities + diacritics to ASCII."""
|
"""Port of VFP CleanWebText: unescape HTML entities + strip diacritics to ASCII.
|
||||||
|
|
||||||
|
NFKD decomposition + combining-mark filter covers RO/HU/DE/CZ/PL/FR/ES in
|
||||||
|
one pass; _NFKD_OVERRIDES handles stroke letters NFKD leaves alone.
|
||||||
|
"""
|
||||||
if not text:
|
if not text:
|
||||||
return ""
|
return ""
|
||||||
result = html.unescape(text)
|
result = html.unescape(text)
|
||||||
result = result.translate(_DIACRITICS)
|
result = result.translate(_NFKD_OVERRIDES)
|
||||||
|
decomposed = unicodedata.normalize('NFKD', result)
|
||||||
|
result = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
|
||||||
# Remove any remaining <br> tags
|
# Remove any remaining <br> tags
|
||||||
for br in ('<br>', '<br/>', '<br />'):
|
for br in ('<br>', '<br/>', '<br />'):
|
||||||
result = result.replace(br, ' ')
|
result = result.replace(br, ' ')
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ def _addr_match(gomag_json, roa_json):
|
|||||||
r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))'
|
r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))'
|
||||||
)
|
)
|
||||||
def norm(s):
|
def norm(s):
|
||||||
s = (s or '').translate(import_service._DIACRITICS).upper()
|
s = import_service.clean_web_text(s or '').upper()
|
||||||
s = _ADDR_WORDS.sub('', s)
|
s = _ADDR_WORDS.sub('', s)
|
||||||
return re.sub(r'[^A-Z0-9]', '', s)
|
return re.sub(r'[^A-Z0-9]', '', s)
|
||||||
def _soundex(s):
|
def _soundex(s):
|
||||||
|
|||||||
@@ -821,6 +821,49 @@ class TestFormatAddressForOracle:
|
|||||||
assert result == "JUD:Bacau;Zemes;Str Principala Modarzau Blocuri"
|
assert result == "JUD:Bacau;Zemes;Str Principala Modarzau Blocuri"
|
||||||
|
|
||||||
|
|
||||||
|
class TestCleanWebTextDiacritics:
|
||||||
|
"""clean_web_text strips diacritics across RO/HU/DE/CZ/PL via NFKD."""
|
||||||
|
|
||||||
|
def test_hungarian_acute(self):
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("BALÁZS LORÁNT") == "BALAZS LORANT"
|
||||||
|
|
||||||
|
def test_hungarian_double_acute(self):
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("Lőrincz Ödön") == "Lorincz Odon"
|
||||||
|
assert clean_web_text("Erdős Pál") == "Erdos Pal"
|
||||||
|
|
||||||
|
def test_romanian_comma_below_modern(self):
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("Ștefan Țîrcă") == "Stefan Tirca"
|
||||||
|
assert clean_web_text("ȘTEFAN ȚÎRCĂ") == "STEFAN TIRCA"
|
||||||
|
|
||||||
|
def test_romanian_cedilla_legacy_preserved(self):
|
||||||
|
"""Cedilla ş/ţ/Ş/Ţ must still normalize to s/t/S/T (regression guard)."""
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("şcoala") == "scoala"
|
||||||
|
assert clean_web_text("ţara") == "tara"
|
||||||
|
assert clean_web_text("ŞTEFAN ŢARA") == "STEFAN TARA"
|
||||||
|
assert clean_web_text("IAŞI") == "IASI"
|
||||||
|
|
||||||
|
def test_german_umlaut_and_eszett(self):
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("Müller Straße") == "Muller Strasse"
|
||||||
|
|
||||||
|
def test_czech_polish(self):
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("Dvořák") == "Dvorak"
|
||||||
|
assert clean_web_text("Łódź") == "Lodz"
|
||||||
|
|
||||||
|
def test_html_entity_unescape(self):
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("Café") == "Cafe"
|
||||||
|
|
||||||
|
def test_empty_input(self):
|
||||||
|
from app.services.import_service import clean_web_text
|
||||||
|
assert clean_web_text("") == ""
|
||||||
|
|
||||||
|
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
# Group 11: TestRefreshOrderAddress
|
# Group 11: TestRefreshOrderAddress
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
|
|||||||
Reference in New Issue
Block a user