fix(address): extract scara/etaj/apartament from comma-less addresses
The Oracle parser failed to extract scara/apartament/etaj (sc/ap/et) when GoMag addresses contained no commas. Added a REGEXP_REPLACE that inserts commas before address keywords in v_strada ahead of the comma-split, ensuring the token parser always fires.

Also added 5 Oracle integration tests that call parseaza_adresa_semicolon directly, and improved diacritics handling in addr_match (Python + JS).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,6 @@ import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from zoneinfo import ZoneInfo
|
||||
@@ -36,8 +35,7 @@ def _addr_match(gomag_json, roa_json):
|
||||
r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))'
|
||||
)
|
||||
def norm(s):
|
||||
s = unicodedata.normalize('NFD', s or '')
|
||||
s = re.sub(r'[\u0300-\u036f]', '', s).upper()
|
||||
s = (s or '').translate(import_service._DIACRITICS).upper()
|
||||
s = _ADDR_WORDS.sub('', s)
|
||||
return re.sub(r'[^A-Z0-9]', '', s)
|
||||
g_street = norm(g.get('address') or g.get('strada') or '')
|
||||
|
||||
Reference in New Issue
Block a user