[addresses] Add normalized_place_name, a method that separates compound fields such as addr:city='New York NY' into simply 'New York', solving the compound-phrase problem. It also addresses the mislabeled place name problem: when a user tag such as addr:city='Harlem' names a known neighborhood that is not a city according to reverse geocoding, the system ignores the user tag and falls back on the reverse-geocoded components. Also includes a few other refactors for expanded address components.
This commit is contained in:
@@ -4,6 +4,7 @@ import random
|
||||
from collections import defaultdict
|
||||
|
||||
from geodata.address_formatting.formatter import AddressFormatter
|
||||
from geodata.address_formatting.aliases import Aliases
|
||||
|
||||
from geodata.addresses.floors import Floor
|
||||
from geodata.addresses.units import Unit
|
||||
@@ -12,7 +13,7 @@ from geodata.language_id.disambiguation import *
|
||||
from geodata.language_id.sample import sample_random_language
|
||||
from geodata.names.normalization import replace_name_prefixes, replace_name_suffixes
|
||||
from geodata.osm.extract import osm_address_components
|
||||
from geodata.states.state_abbreviations import STATE_ABBREVIATIONS, STATE_EXPANSIONS
|
||||
from geodata.states.state_abbreviations import state_abbreviations
|
||||
|
||||
|
||||
class AddressExpander(object):
|
||||
@@ -24,7 +25,7 @@ class AddressExpander(object):
|
||||
directly to AddressFormatter.format_address to produce training examples.
|
||||
|
||||
There are several steps in expanding an address including reverse geocoding
|
||||
to polygons, disambiguating which language the address uses, stripping standard
|
||||
to polygons, disambiguating which language the address uses, stripping standard
|
||||
prefixes like "London Borough of", pruning duplicates like "Antwerpen, Antwerpen, Antwerpen".
|
||||
|
||||
Usage:
|
||||
@@ -48,6 +49,7 @@ class AddressExpander(object):
|
||||
rare_components = {
|
||||
AddressFormatter.SUBURB,
|
||||
AddressFormatter.CITY_DISTRICT,
|
||||
AddressFormatter.ISLAND,
|
||||
AddressFormatter.STATE_DISTRICT,
|
||||
AddressFormatter.STATE,
|
||||
}
|
||||
@@ -56,6 +58,7 @@ class AddressExpander(object):
|
||||
AddressFormatter.SUBURB,
|
||||
AddressFormatter.CITY_DISTRICT,
|
||||
AddressFormatter.CITY,
|
||||
AddressFormatter.ISLAND,
|
||||
AddressFormatter.STATE_DISTRICT,
|
||||
AddressFormatter.STATE
|
||||
)
|
||||
@@ -75,6 +78,10 @@ class AddressExpander(object):
|
||||
}
|
||||
}
|
||||
|
||||
ALL_OSM_NAME_KEYS = set(['name', 'name:simple',
|
||||
'ISO3166-1:alpha2', 'ISO3166-1:alpha3',
|
||||
'short_name', 'alt_name', 'official_name'])
|
||||
|
||||
def __init__(self, osm_admin_rtree, language_rtree, neighborhoods_rtree, buildings_rtree, subdivisions_rtree, quattroshapes_rtree, geonames):
|
||||
self.osm_admin_rtree = osm_admin_rtree
|
||||
self.language_rtree = language_rtree
|
||||
@@ -130,7 +137,10 @@ class AddressExpander(object):
|
||||
|
||||
def pick_random_name_key(self, suffix=''):
|
||||
'''
|
||||
|
||||
Random name
|
||||
-----------
|
||||
|
||||
Pick a name key from OSM
|
||||
'''
|
||||
name_key = ''.join(('name', suffix))
|
||||
raw_name_key = 'name'
|
||||
@@ -156,17 +166,96 @@ class AddressExpander(object):
|
||||
key = official_name_key
|
||||
raw_key = raw_official_name_key
|
||||
else:
|
||||
# 10% of the time use the official name
|
||||
# 10% of the time use the alt name
|
||||
key = alt_name_key
|
||||
raw_key = raw_alt_name_key
|
||||
|
||||
return key, raw_key
|
||||
|
||||
def contains_multiple_place_names()
|
||||
def all_names(self, props, languages=None):
    '''
    All names
    ---------

    Collect every name value found in a dict of OSM properties.

    A value is included when its key is one of ALL_OSM_NAME_KEYS, or when
    the key has the form "<name key>:<qualifier>" and the part of the
    qualifier before the first underscore is in `languages`.

    props: dict mapping OSM tag keys to values
    languages: optional container of language codes; when omitted, no
               language-qualified names are collected

    Returns a set of the matching values.
    '''
    # The original evaluated "... in languages" with languages=None, which
    # raises TypeError as soon as a qualified name key (e.g. "name:en") is
    # encountered. Treat a missing value as "no languages".
    languages = languages or ()

    names = set()
    # dict.items() works on both Python 2 and Python 3
    for key, value in props.items():
        if key in self.ALL_OSM_NAME_KEYS:
            names.add(value)
        elif ':' in key:
            base_key, qualifier = key.split(':', 1)
            language = qualifier.split('_', 1)[0]
            if base_key in self.ALL_OSM_NAME_KEYS and language in languages:
                names.add(value)
    return names
|
||||
|
||||
def normalize_address_components(self, value):
|
||||
address_components = {k: v for k, v in value.iteritems() if k in self.formatter.aliases}
|
||||
self.formatter.replace_aliases(address_components)
|
||||
def normalized_place_name(self, name, tag, osm_components, country=None, state=None, languages=None, whitespace=True):
    '''
    Multiple place names
    --------------------

    This is to help with things like addr:city="New York NY"

    The name is tokenized and scanned for phrases matching the names of
    the given OSM components (plus state abbreviations when country,
    languages and state are all provided).

    Returns one of:
    - the tokens preceding the second known phrase, joined, when more
      than one known phrase is found
    - None when the first known phrase is preceded by other tokens and
      is not recorded under `tag`
    - the text before the first comma when the name contains a comma
    - the name unchanged otherwise

    NOTE(review): the block nesting below was reconstructed from a view
    of this code that does not preserve indentation; confirm it against
    the repository before relying on it.
    '''

    names = set()

    # Maps lowercased component name -> set of component types returned by
    # osm_address_components.get_component for that component's properties
    components = defaultdict(set)
    for props in osm_components:
        component_names = self.all_names(props, languages=languages)
        names |= component_names

        for k, v in six.iteritems(props):
            normalized_key = osm_address_components.get_component(country, k, v)
            for cn in component_names:
                components[cn.lower()].add(normalized_key)

    # State abbreviations are added as known phrases so they can be matched
    # in the name as well
    if country and languages and state:
        for language in languages:
            state_code = state_abbreviations.get_abbreviation(country, language, state)
            if state_code:
                names.add(state_code.upper())

    phrase_filter = PhraseFilter([(n.lower(), '') for n in names])

    tokens = tokenize(name)
    # Matching is done on lowercased tokens; the original tokens are kept
    # for building the returned string
    tokens_lower = [(t.lower(), c) for t, c in tokens]
    phrases = list(phrase_filter.filter(tokens_lower))

    num_phrases = 0
    total_tokens = 0
    for is_phrase, phrase_tokens, value in phrases:
        if is_phrase:
            join_phrase = six.u(' ') if whitespace else six.u('')

            if num_phrases > 0:
                # A second known phrase: keep only the tokens before it
                return join_phrase.join([t for t, c in tokens[:total_tokens]])
            elif num_phrases == 0 and total_tokens > 0:
                # First known phrase preceded by other tokens: give up
                # unless that phrase is recorded under the requested tag
                phrase = join_phrase.join([t for t, c in phrase_tokens])
                if tag not in components.get(phrase, set()):
                    return None

            # NOTE(review): current_phrase is assigned but not read again
            # in this function
            current_phrase = tokens[total_tokens:total_tokens + len(phrase_tokens)]
            total_tokens += len(phrase_tokens)
            num_phrases += 1
        else:
            total_tokens += 1

    # If the name contains a comma, stop and only use the phrase before the comma
    if ',' in name:
        return name.split(',')[0].strip()

    return name
|
||||
|
||||
def normalize_place_names(self, address_components, osm_components, country=None, languages=None, whitespace=True):
    '''
    Normalize place names
    ---------------------

    Build a new dict of address components in which every component whose
    key is in BOUNDARY_COMPONENTS has been passed through
    normalized_place_name. Other components are copied unchanged.

    When normalized_place_name returns None for a component, that
    component is left out of the result instead of being stored as None.

    address_components is not modified; callers need to use the returned
    dict to see the normalized values.
    '''
    state = address_components.get(AddressFormatter.STATE, None)

    normalized = {}
    for key, name in address_components.items():
        if key in self.BOUNDARY_COMPONENTS:
            name = self.normalized_place_name(name, key, osm_components, country=country,
                                              state=state, languages=languages, whitespace=whitespace)
            # None means the tag value should be ignored, so do not carry
            # a None value forward in the components
            if name is None:
                continue

        normalized[key] = name
    return normalized
|
||||
|
||||
def normalize_address_components(self, components):
    '''
    Normalize address components
    ----------------------------

    Select the entries of `components` whose keys are in the formatter's
    aliases, pass the resulting dict to self.formatter.aliases.replace,
    and return that dict. The input dict is not modified here.
    '''
    # dict.iteritems() only exists on Python 2; items() works on 2 and 3
    address_components = {k: v for k, v in components.items()
                          if k in self.formatter.aliases}
    self.formatter.aliases.replace(address_components)
    return address_components
|
||||
|
||||
def country_name(self, address_components, country_code, language,
|
||||
@@ -247,7 +336,7 @@ class AddressExpander(object):
|
||||
address_state = address_components.get(AddressFormatter.STATE)
|
||||
|
||||
if address_state and country and not non_local_language:
|
||||
state_full_name = STATE_ABBREVIATIONS.get(country.upper(), {}).get(address_state.upper(), {}).get(language)
|
||||
state_full_name = state_abbreviations.get_full_name(country, language, address_state)
|
||||
|
||||
if state_full_name and random.random() < state_full_name_prob:
|
||||
address_state = state_full_name
|
||||
@@ -266,8 +355,8 @@ class AddressExpander(object):
|
||||
return osm_suffix
|
||||
|
||||
def add_admin_boundaries(self, address_components,
|
||||
osm_components,
|
||||
country, language,
|
||||
latitude, longitude,
|
||||
osm_suffix='',
|
||||
non_local_language=None,
|
||||
random_key=True,
|
||||
@@ -294,8 +383,6 @@ class AddressExpander(object):
|
||||
include these qualifiers in the training data.
|
||||
'''
|
||||
|
||||
osm_components = self.osm_reverse_geocoded_components(country, latitude, longitude)
|
||||
|
||||
name_key = ''.join(('name', osm_suffix))
|
||||
raw_name_key = 'name'
|
||||
simple_name_key = 'name:simple'
|
||||
@@ -352,12 +439,13 @@ class AddressExpander(object):
|
||||
if component not in address_components or (non_local_language and random.random() < replace_with_non_local_prob):
|
||||
if component == AddressFormatter.STATE_DISTRICT and random.random() < join_state_district_prob:
|
||||
num = random.randrange(1, len(vals) + 1)
|
||||
val = u', '.join(vals[:num])
|
||||
val = six.u(', ').join(vals[:num])
|
||||
else:
|
||||
val = random.choice(vals)
|
||||
|
||||
if component == AddressFormatter.STATE and random.random() < expand_state_prob:
|
||||
val = STATE_EXPANSIONS.get(country.upper(), {}).get(val, val)
|
||||
val = state_abbreviations.get_full_name(country, language, val, default=val)
|
||||
|
||||
address_components[component] = val
|
||||
|
||||
def quattroshapes_city(self, address_components,
|
||||
@@ -409,8 +497,11 @@ class AddressExpander(object):
|
||||
|
||||
return city
|
||||
|
||||
def neighborhood_components(self, latitude, longitude):
    '''
    Neighborhood components
    -----------------------

    Query the neighborhoods index for the given point and return
    everything it yields (point_in_poly is called with return_all=True).
    '''
    rtree = self.neighborhoods_rtree
    matches = rtree.point_in_poly(latitude, longitude, return_all=True)
    return matches
|
||||
|
||||
def add_neighborhoods(self, address_components,
|
||||
latitude, longitude,
|
||||
neighborhoods,
|
||||
osm_suffix='',
|
||||
add_prefix_prob=0.5,
|
||||
add_neighborhood_prob=0.5):
|
||||
@@ -426,7 +517,6 @@ class AddressExpander(object):
|
||||
on the whole of better quality).
|
||||
'''
|
||||
|
||||
neighborhoods = self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True)
|
||||
neighborhood_levels = defaultdict(list)
|
||||
|
||||
name_key = ''.join(('name', osm_suffix))
|
||||
@@ -445,7 +535,7 @@ class AddressExpander(object):
|
||||
name_prefix = neighborhood.get('name:prefix')
|
||||
|
||||
if name_prefix and random.random() < add_prefix_prob:
|
||||
name = u' '.join([name_prefix, name])
|
||||
name = six.u(' ').join([name_prefix, name])
|
||||
|
||||
if not name:
|
||||
continue
|
||||
@@ -468,7 +558,7 @@ class AddressExpander(object):
|
||||
if component not in address_components and random.random() < add_neighborhood_prob:
|
||||
address_components[component] = neighborhoods[0]
|
||||
|
||||
def normalize_names(self, address_components, replacement_prob=0.6):
|
||||
def replace_name_affixes(self, address_components, replacement_prob=0.6):
|
||||
'''
|
||||
Name normalization
|
||||
------------------
|
||||
@@ -576,7 +666,9 @@ class AddressExpander(object):
|
||||
|
||||
osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
|
||||
|
||||
self.add_admin_boundaries(address_components, country, language, latitude, longitude,
|
||||
osm_components = self.osm_reverse_geocoded_components(country, latitude, longitude)
|
||||
|
||||
self.add_admin_boundaries(address_components, osm_components, country, language,
|
||||
non_local_language=non_local_language,
|
||||
osm_suffix=osm_suffix)
|
||||
|
||||
@@ -584,12 +676,17 @@ class AddressExpander(object):
|
||||
if city:
|
||||
address_components[AddressFormatter.CITY] = city
|
||||
|
||||
self.add_neighborhoods(address_components, latitude, longitude,
|
||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||
|
||||
self.add_neighborhoods(address_components, neighborhoods,
|
||||
osm_suffix=osm_suffix)
|
||||
|
||||
street = address_components.get(AddressFormatter.ROAD)
|
||||
|
||||
self.normalize_names(address_components)
|
||||
all_osm_components = osm_components + neighborhoods
|
||||
self.normalize_place_names(address_components, all_osm_components, country=country)
|
||||
|
||||
self.replace_name_affixes(address_components)
|
||||
|
||||
self.replace_names(address_components)
|
||||
|
||||
@@ -638,24 +735,33 @@ class AddressExpander(object):
|
||||
|
||||
osm_suffix = self.tag_suffix(language, non_local_language, more_than_one_official_language)
|
||||
|
||||
self.add_osm_boundaries(address_components, country, language, latitude, longitude,
|
||||
osm_suffix=osm_suffix,
|
||||
non_local_language=non_local_language,
|
||||
random_key=False,
|
||||
alpha_3_iso_code_prob=0.0,
|
||||
alpha_2_iso_code_prob=0.0,
|
||||
replace_with_non_local_prob=0.0,
|
||||
expand_state_prob=1.0)
|
||||
osm_components = self.osm_reverse_geocoded_components(country, latitude, longitude)
|
||||
|
||||
self.add_admin_boundaries(address_components, osm_components, country, language,
|
||||
osm_suffix=osm_suffix,
|
||||
non_local_language=non_local_language,
|
||||
random_key=False,
|
||||
alpha_3_iso_code_prob=0.0,
|
||||
alpha_2_iso_code_prob=0.0,
|
||||
replace_with_non_local_prob=0.0,
|
||||
expand_state_prob=1.0)
|
||||
|
||||
city = self.quattroshapes_city(address_components, latitude, longitude, language, non_local_language=non_local_language)
|
||||
|
||||
if city:
|
||||
address_components[AddressFormatter.CITY] = city
|
||||
|
||||
self.add_neighborhoods(address_components, latitude, longitude,
|
||||
neighborhoods = self.neighborhood_components(latitude, longitude)
|
||||
|
||||
self.add_neighborhoods(address_components, neighborhoods,
|
||||
osm_suffix=osm_suffix)
|
||||
|
||||
self.normalize_names(address_components)
|
||||
all_osm_components = osm_components + neighborhoods
|
||||
self.normalize_place_names(address_components, all_osm_components, country=country)
|
||||
|
||||
self.replace_name_affixes(address_components)
|
||||
|
||||
self.replace_names(address_components)
|
||||
|
||||
self.prune_duplicate_names(address_components)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user