[osm] adding new validity check to venue names so if the Jaccard(name tokens, street & house numer tokens) == 1 and the address does not have a known venue type e.g. a restaurant, the "venue name" is actually just the street address and can be discarded

This commit is contained in:
Al
2017-01-11 16:23:42 -05:00
parent 7f851810d2
commit bec569adaa

View File

@@ -37,6 +37,7 @@ from geodata.i18n.languages import *
from geodata.intersections.query import Intersection, IntersectionQuery
from geodata.address_formatting.formatter import AddressFormatter
from geodata.osm.components import osm_address_components
from geodata.osm.definitions import osm_definitions
from geodata.osm.extract import *
from geodata.osm.intersections import OSMIntersectionReader
from geodata.places.config import place_config
@@ -213,24 +214,77 @@ class OSMAddressFormatter(object):
return None
def valid_venue_name(self, name, address_components, languages=None, is_generic=False):
@classmethod
def alphanumeric_tokens(cls, name):
return [t for t, c in tokenize(name) if c in token_types.WORD_TOKEN_TYPES or c in token_types.NUMERIC_TOKEN_TYPES]
@classmethod
def valid_venue_name(cls, name, address_components, languages=(), is_generic=False, is_known_venue_type=False):
'''
The definition of "what is a venue" is pretty loose in libpostal, but
there are several cases we want to avoid:
Firstly, if the venue name is the same as the house number, usually in the
case of a named building, it's not valid.
Sometimes a particular street is synonymous with a tourist attraction or neighborhood
or historic district and OSM might label it as something like a landmark,
which passes the "named place" test but is really just a street, so if the
name of the place is exactly equal to the street name and there's no house number
(this still allows for bars or restaurants, etc. that might be named after the
street they're on), call it invalid.
Often the name or a building in OSM will just be its street address. Example:
addr:housename="123 Park Place"
addr:housenumber="123"
addr:street="Park Place"
This method will invalidate the venue name if and only if
the set of unique tokens in the venue name is exactly equal
to the set of unique tokens in the house number + street name.
This does not apply to known venue typess (like a restaurant)
where we'd want to preserve the name.
'''
if not name:
return False
name_norm = name.strip().lower()
if AddressFormatter.ROAD in address_components and address_components[AddressFormatter.ROAD].strip().lower() == name_norm:
street = address_components.get(AddressFormatter.ROAD)
if street:
street = street.strip().lower()
house_number = address_components.get(AddressFormatter.HOUSE_NUMBER)
if house_number:
house_number = house_number.strip().lower()
if street and street == name_norm and not house_number:
return False
if AddressFormatter.HOUSE_NUMBER not in address_components:
if not house_number:
if not any((c in token_types.WORD_TOKEN_TYPES for t, c in tokenize(name))):
return False
if AddressFormatter.ROAD not in address_components:
if not street:
for component, place_name in six.iteritems(address_components):
if (component in AddressFormatter.BOUNDARY_COMPONENTS or component == AddressFormatter.HOUSE_NUMBER) and place_name.strip().lower() == name_norm:
return False
if street and not is_known_venue_type:
unique_tokens_name = set([t.lower() for t in cls.alphanumeric_tokens(name)])
unique_tokens_street_house_number = set()
if street:
unique_tokens_street_house_number.update([t.lower() for t in cls.alphanumeric_tokens(street)])
if house_number:
unique_tokens_street_house_number.update([t.lower() for t in cls.alphanumeric_tokens(house_number)])
if len(unique_tokens_name) == len(unique_tokens_street_house_number) == len(unique_tokens_name & unique_tokens_street_house_number):
return False
venue_phrases = venue_names_gazetteer.extract_phrases(name, languages=languages)
street_phrases = street_types_only_gazetteer.extract_phrases(name, languages=languages)
@@ -837,9 +891,17 @@ class OSMAddressFormatter(object):
return revised_place_tags, country
def is_generic_place(self, tags):
@classmethod
def is_generic_place(cls, tags):
return tags.get('railway', '').lower() == 'station' or tags.get('amenity', '').lower() == 'post_office'
@classmethod
def is_known_venue_type(cls, tags):
for definition in (osm_definitions.AMENITY, osm_definitions.SHOP, osm_definitions.AEROWAY):
if osm_definitions.meets_definition(tags, definition):
return True
return False
def category_queries(self, tags, address_components, language, country=None, tag_components=True):
formatted_addresses = []
possible_category_keys = category_config.has_keys(language, tags)
@@ -951,6 +1013,7 @@ class OSMAddressFormatter(object):
self.remove_japanese_suburb_tags(tags)
is_generic_place = self.is_generic_place(tags)
is_known_venue_type = self.is_known_venue_type(tags)
revised_tags = self.normalize_address_components(tags)
if japanese_suburb is not None:
@@ -971,7 +1034,7 @@ class OSMAddressFormatter(object):
building_venue_names = []
building_components = self.building_components(latitude, longitude)
building_is_generic_place = False
if building_components:
num_floors = self.num_floors(building_components)
@@ -980,13 +1043,15 @@ class OSMAddressFormatter(object):
for building_tags in building_components:
building_tags = self.normalize_address_components(building_tags)
building_is_generic_place = building_is_generic_place or self.is_generic_place(building_tags)
building_is_known_venue_type = building_is_known_venue_type or self.is_known_venue_type(building_tags)
for k, v in six.iteritems(building_tags):
if k not in revised_tags and k in (AddressFormatter.HOUSE_NUMBER, AddressFormatter.ROAD, AddressFormatter.POSTCODE):
revised_tags[k] = v
elif k == AddressFormatter.HOUSE:
building_venue_names.append(v)
building_venue_names.append((v, building_is_generic_place, building_is_known_venue_type))
building_is_generic_place = building_is_generic_place or self.is_generic_place(building_tags)
subdivision_components = self.subdivision_components(latitude, longitude)
if subdivision_components:
@@ -1027,7 +1092,7 @@ class OSMAddressFormatter(object):
street_languages = set((language,) if language not in (UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE) else languages)
venue_names = [venue_name for venue_name in venue_names if self.valid_venue_name(venue_name, expanded_components, street_languages, is_generic=is_generic_place)]
venue_names.extend([venue_name for venue_name in building_venue_names if self.valid_venue_name(venue_name, expanded_components, street_languages, is_generic=building_is_generic_place)])
venue_names.extend([venue_name for venue_name, building_is_generic_place, building_is_known_venue_type in building_venue_names if self.valid_venue_name(venue_name, expanded_components, street_languages, is_generic=building_is_generic_place, is_known_venue_type=building_is_known_venue_type)])
all_venue_names = set(venue_names)
@@ -1292,9 +1357,8 @@ class OSMAddressFormatter(object):
default_language = candidate_languages[0][0]
elif not more_than_one_official_language:
default_language = None
name = way['name']
if not name:
continue
name = way.get('name')
if name:
address_language = self.components.address_language({AddressFormatter.ROAD: name}, candidate_languages)
if address_language and address_language not in (UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE):
default_language = address_language