[osm] Stripping standard city prefixes/suffixes e.g. Township of
This commit is contained in:
31
scripts/geodata/names/normalization.py
Normal file
31
scripts/geodata/names/normalization.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
import re
|
||||||
|
|
||||||
|
from geodata.encoding import safe_decode
|
||||||
|
|
||||||
|
# Standard administrative prefixes, e.g. "City of Springfield". Each entry
# gets a trailing space so it only matches when a separate word follows.
name_prefixes = ['{} '.format(s) for s in (
    'city of',
    'township of',
    'municipality of',
    'borough of',
    'london borough of',
    'town of',
)]

# Standard administrative suffixes, e.g. "Springfield Township". Each entry
# gets a leading space so it only matches as a separate trailing word.
name_suffixes = [' {}'.format(s) for s in (
    'township',
    'municipality',
)]

# The alternation is wrapped in a non-capturing group so the ^ / $ anchor
# applies to every alternative. Without the group, '^a|b|c' anchors only 'a'
# and 'a|b$' anchors only 'b', so the other phrases would be stripped from
# the middle of a name ("Royal Borough of Greenwich" -> "Royal Greenwich").
name_prefix_regex = re.compile('^(?:{})'.format('|'.join(name_prefixes)), re.I | re.UNICODE)
name_suffix_regex = re.compile('(?:{})$'.format('|'.join(name_suffixes)), re.I | re.UNICODE)
|
||||||
|
|
||||||
|
|
||||||
|
def replace_name_prefixes(name):
    '''
    Return name with any text matched by name_prefix_regex removed.

    The input is passed through safe_decode before matching (presumably
    to obtain unicode text -- confirm against geodata.encoding).
    '''
    return name_prefix_regex.sub('', safe_decode(name))
|
||||||
|
|
||||||
|
|
||||||
|
def replace_name_suffixes(name):
    '''
    Return name with any text matched by name_suffix_regex removed.

    The input is passed through safe_decode before matching (presumably
    to obtain unicode text -- confirm against geodata.encoding).
    '''
    return name_suffix_regex.sub('', safe_decode(name))
|
||||||
@@ -63,6 +63,7 @@ from geodata.states.state_abbreviations import STATE_ABBREVIATIONS, STATE_EXPANS
|
|||||||
from geodata.language_id.polygon_lookup import country_and_languages
|
from geodata.language_id.polygon_lookup import country_and_languages
|
||||||
from geodata.i18n.languages import *
|
from geodata.i18n.languages import *
|
||||||
from geodata.address_formatting.formatter import AddressFormatter
|
from geodata.address_formatting.formatter import AddressFormatter
|
||||||
|
from geodata.names.normalization import replace_name_prefixes, replace_name_suffixes
|
||||||
from geodata.osm.extract import *
|
from geodata.osm.extract import *
|
||||||
from geodata.polygons.language_polys import *
|
from geodata.polygons.language_polys import *
|
||||||
from geodata.polygons.reverse_geocode import *
|
from geodata.polygons.reverse_geocode import *
|
||||||
@@ -192,6 +193,15 @@ osm_fields = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# Address components whose names are candidates for standard prefix/suffix
# stripping during name normalization of the training data.
REPLACE_COMPONENTS = (
    AddressFormatter.SUBURB,
    AddressFormatter.CITY_DISTRICT,
    AddressFormatter.CITY,
    AddressFormatter.STATE_DISTRICT,
    AddressFormatter.STATE,
)
|
||||||
|
|
||||||
|
|
||||||
def write_osm_json(filename, out_filename):
|
def write_osm_json(filename, out_filename):
|
||||||
out = open(out_filename, 'w')
|
out = open(out_filename, 'w')
|
||||||
writer = csv.writer(out, 'tsv_no_quote')
|
writer = csv.writer(out, 'tsv_no_quote')
|
||||||
@@ -709,6 +719,18 @@ def build_address_format_training_data(admin_rtree, language_rtree, neighborhood
|
|||||||
if component not in address_components and random.random() < 0.5:
|
if component not in address_components and random.random() < 0.5:
|
||||||
address_components[component] = neighborhoods[0]
|
address_components[component] = neighborhoods[0]
|
||||||
|
|
||||||
|
'''
|
||||||
|
Name normalization
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Probabilistically strip standard prefixes/suffixes e.g. "London Borough of"
|
||||||
|
'''
|
||||||
|
for component in REPLACE_COMPONENTS:
    # Not every address has every admin component; skip missing or empty
    # ones instead of raising KeyError.
    name = address_components.get(component)
    if not name:
        continue
    # Strip the suffix first, then the prefix. The name must be passed
    # in: calling replace_name_suffixes() with no argument raises TypeError.
    replacement = replace_name_prefixes(replace_name_suffixes(name))
    # Only substitute some of the time so both the full and the stripped
    # forms appear in the generated data.
    if replacement != name and random.random() < 0.6:
        address_components[component] = replacement
|
||||||
|
|
||||||
# Version with all components
|
# Version with all components
|
||||||
formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
|
formatted_address = formatter.format_address(country, address_components, tag_components=tag_components, minimal_only=not tag_components)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user