[osm/addresses] using new is_numeric in AddressComponents expansion and removing venue names that are identical to the house number

This commit is contained in:
Al
2016-06-23 13:59:40 -04:00
parent 106dfa80c3
commit cf2ed2b299

View File

@@ -31,6 +31,7 @@ from geodata.names.normalization import name_affixes
from geodata.osm.components import osm_address_components
from geodata.places.config import place_config
from geodata.states.state_abbreviations import state_abbreviations
from geodata.text.utils import is_numeric
this_dir = os.path.realpath(os.path.dirname(__file__))
@@ -405,7 +406,7 @@ class AddressComponents(object):
for k, combo in six.iteritems(combo_config):
components = OrderedDict.fromkeys(combo['components']).keys()
if not all((c in address_components and (c in generated_components or self.is_numeric(address_components[c])) for c in components)):
if not all((c in address_components and (c in generated_components or is_numeric(address_components[c])) for c in components)):
continue
combos.append((len(components), combo))
@@ -471,13 +472,9 @@ class AddressComponents(object):
else:
return num_type
def is_numeric(self, component):
tokens = tokenize(component)
return sum((1 for t, c in tokens if c == token_types.NUMERIC or c not in token_types.WORD_TOKEN_TYPES)) == len(tokens)
def get_component_phrase(self, cls, component, language, country=None):
component = safe_decode(component)
if self.is_numeric(component):
if is_numeric(component):
phrase = cls.phrase(component, language, country=country)
if phrase != component:
return phrase
@@ -937,6 +934,23 @@ class AddressComponents(object):
for component in components[1:]:
address_components.pop(component, None)
def cleanup_venue_name(self, address_components):
'''
Venue name cleanup
------------------
A venue name that's the same as the house number is not valid.
This occurs sometimes in OSM where perhaps "7" could be the name
of the building but also its house number.
'''
venue_name = address_components.get(AddressFormatter.HOUSE)
house_number = address_components.get(AddressFormatter.HOUSE_NUMBER)
if venue_name and house_number and venue_name.strip() == house_number.strip():
address_components.pop(AddressFormatter.HOUSE)
def cleanup_house_number(self, address_components):
'''
House number cleanup
@@ -1108,6 +1122,8 @@ class AddressComponents(object):
self.prune_duplicate_names(address_components)
self.cleanup_venue_name(address_components)
self.cleanup_house_number(address_components)
self.add_house_number_phrase(address_components, language, country=country)
self.add_postcode_phrase(address_components, language, country=country)