fix(address): extract scara/etaj/apartament from comma-less addresses
The Oracle parser failed to extract sc/ap/et (scara/apartament/etaj) when GoMag addresses contained no commas. This commit adds a REGEXP_REPLACE that inserts commas before address keywords in v_strada ahead of the comma-split, ensuring the token parser always fires. It also adds 5 Oracle integration tests that call parseaza_adresa_semicolon directly, and improves diacritics handling in the address-matching helpers (_addr_match in Python, addrMatch in JS).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,7 +2,6 @@ import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from zoneinfo import ZoneInfo
|
||||
@@ -36,8 +35,7 @@ def _addr_match(gomag_json, roa_json):
|
||||
r'ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))'
|
||||
)
|
||||
def norm(s):
|
||||
s = unicodedata.normalize('NFD', s or '')
|
||||
s = re.sub(r'[\u0300-\u036f]', '', s).upper()
|
||||
s = (s or '').translate(import_service._DIACRITICS).upper()
|
||||
s = _ADDR_WORDS.sub('', s)
|
||||
return re.sub(r'[^A-Z0-9]', '', s)
|
||||
g_street = norm(g.get('address') or g.get('strada') or '')
|
||||
|
||||
@@ -822,8 +822,13 @@ function fmtAddr(a) {
|
||||
|
||||
function addrMatch(gomag, roa) {
|
||||
if (!gomag || !roa) return true; // can't compare
|
||||
const _DIAC = {
|
||||
'\u0103':'a','\u00e2':'a','\u00ee':'i','\u0219':'s','\u021b':'t',
|
||||
'\u0102':'A','\u00c2':'A','\u00ce':'I','\u0218':'S','\u021a':'T',
|
||||
'\u015f':'s','\u0163':'t','\u015e':'S','\u0162':'T'
|
||||
};
|
||||
function norm(s) {
|
||||
return (s || '').normalize('NFD').replace(/[\u0300-\u036f]/g, '')
|
||||
return (s || '').replace(/[\u0103\u00e2\u00ee\u0219\u021b\u0102\u00c2\u00ce\u0218\u021a\u015f\u0163\u015e\u0162]/g, c => _DIAC[c] || c)
|
||||
.toUpperCase()
|
||||
.replace(/\b(STR|STRADA|NR|NUMAR|NUMARUL|BL|BLOC|SC|SCARA|AP|APART|APARTAMENT|ET|ETAJ|COM|COMUNA|SAT|MUN|MUNICIPIUL|JUD|JUDETUL|CARTIER|PARTER|SECTOR|SECTORUL|ORAS)(?:\b|(?=\d))/g, '')
|
||||
.replace(/[^A-Z0-9]/g, '');
|
||||
|
||||
Reference in New Issue
Block a user