fix(import): NFKD normalization for non-Romanian diacritics
clean_web_text used a hard-coded Romanian-only translation map, so Hungarian
(BALÁZS LORÁNT), German, Czech, Polish names passed through unchanged into
SQLite and Oracle ROA. Replace with unicodedata.normalize('NFKD') + combining
mark strip — covers RO/HU/DE/CZ/PL/FR/ES universally. Romanian cedilla legacy
forms (ş/ţ/Ş/Ţ) remain handled (NFKD decomposes to base + combining cedilla).
Letters that have no NFKD decomposition (stroke letters ł, đ, ø; sharp s ß;
ligatures æ, œ) are covered via the _NFKD_OVERRIDES translation map.
sync_service._addr_match.norm migrated off the removed _DIACRITICS constant
to clean_web_text; address matching now also handles non-RO diacritics.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -821,6 +821,49 @@ class TestFormatAddressForOracle:
|
||||
assert result == "JUD:Bacau;Zemes;Str Principala Modarzau Blocuri"
|
||||
|
||||
|
||||
class TestCleanWebTextDiacritics:
    """clean_web_text strips diacritics across RO/HU/DE/CZ/PL via NFKD.

    Each test imports ``clean_web_text`` locally, then checks that accented
    input comes back as its plain ASCII base letters with case preserved.
    """

    def test_hungarian_acute(self):
        """Acute accents are stripped from an upper-case Hungarian name."""
        from app.services.import_service import clean_web_text

        assert clean_web_text("BALÁZS LORÁNT") == "BALAZS LORANT"

    def test_hungarian_double_acute(self):
        """Double-acute (ő) and diaeresis (Ö/ö) letters reduce to o/O."""
        from app.services.import_service import clean_web_text

        assert clean_web_text("Lőrincz Ödön") == "Lorincz Odon"
        assert clean_web_text("Erdős Pál") == "Erdos Pal"

    def test_romanian_comma_below_modern(self):
        """Modern comma-below Ș/Ț plus î/ă normalize in both cases."""
        from app.services.import_service import clean_web_text

        assert clean_web_text("Ștefan Țîrcă") == "Stefan Tirca"
        assert clean_web_text("ȘTEFAN ȚÎRCĂ") == "STEFAN TIRCA"

    def test_romanian_cedilla_legacy_preserved(self):
        """Cedilla ş/ţ/Ş/Ţ must still normalize to s/t/S/T (regression guard)."""
        from app.services.import_service import clean_web_text

        assert clean_web_text("şcoala") == "scoala"
        assert clean_web_text("ţara") == "tara"
        assert clean_web_text("ŞTEFAN ŢARA") == "STEFAN TARA"
        assert clean_web_text("IAŞI") == "IASI"

    def test_german_umlaut_and_eszett(self):
        """Umlaut ü becomes u and sharp s (ß) expands to "ss"."""
        from app.services.import_service import clean_web_text

        assert clean_web_text("Müller Straße") == "Muller Strasse"

    def test_czech_polish(self):
        """Czech caron/acute and Polish Ł/ó/ź all reduce to base letters."""
        from app.services.import_service import clean_web_text

        assert clean_web_text("Dvořák") == "Dvorak"
        assert clean_web_text("Łódź") == "Lodz"

    def test_html_entity_unescape(self):
        """An HTML entity is decoded first, then stripped of its diacritic."""
        from app.services.import_service import clean_web_text

        # The entity form is the case this test is named for; without it the
        # test only repeats the plain-diacritic checks above.
        assert clean_web_text("Caf&eacute;") == "Cafe"
        # An already-decoded literal must give the same result.
        assert clean_web_text("Café") == "Cafe"

    def test_empty_input(self):
        """An empty string is returned unchanged."""
        from app.services.import_service import clean_web_text

        assert clean_web_text("") == ""
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Group 11: TestRefreshOrderAddress
|
||||
# ===========================================================================
|
||||
|
||||
Reference in New Issue
Block a user