[boundaries] in addition to population, check if a city has an unambiguous Wikipedia

This commit is contained in:
Al
2016-11-25 13:29:59 -08:00
parent 78c1a40708
commit cdbc102821
4 changed files with 57 additions and 19 deletions

View File

@@ -909,6 +909,34 @@ class AddressComponents(object):
return city
generic_wiki_name_regex = re.compile('^[a-z]{2,3}:')
@classmethod
def unambiguous_wikipedia(cls, osm_component, language):
    """Check whether a component's name matches its Wikipedia article title.

    The title is read from the ``wikipedia:<language>`` tag, falling back
    to the plain ``wikipedia`` tag.  A leading ``<language>:`` prefix (or
    any prefix matched by ``generic_wiki_name_regex``) is removed before
    comparing.  Both sides are compared after stripping surrounding
    whitespace and lower-casing.

    When no usable title is found and the language is unknown/ambiguous,
    every ``wikipedia:*`` tag on the component is compared against the
    name instead.

    :param osm_component: mapping of OSM tag keys to values
    :param language: language code, or UNKNOWN_LANGUAGE / AMBIGUOUS_LANGUAGE
    :return: True if the name equals the Wikipedia title, otherwise False
    """
    name = osm_component.get('name')
    if not name:
        return False

    norm_name = safe_decode(name).strip().lower()
    language_unknown = language in (UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE)

    wiki_name = osm_component.get('wikipedia:{}'.format(language))
    if not wiki_name:
        wiki_name = osm_component.get('wikipedia')

    if wiki_name:
        # Decode once up front so the prefix test, the regex match and the
        # split below all operate on text rather than mixing bytes and text.
        # NOTE(review): assumes safe_decode returns text input unchanged.
        wiki_name = safe_decode(wiki_name)
        has_language_prefix = (not language_unknown and
                               wiki_name.lower().startswith(six.u('{}:'.format(language))))
        if has_language_prefix or cls.generic_wiki_name_regex.match(wiki_name):
            wiki_name = wiki_name.split(six.u(':'), 1)[-1]

    # wiki_name may be missing, or empty after removing a bare prefix ("en:")
    if not wiki_name:
        if not language_unknown:
            return False
        # Language is not known: accept a match against any language-specific tag
        return any(k.startswith('wikipedia:') and safe_decode(v).strip().lower() == norm_name
                   for k, v in six.iteritems(osm_component))

    return norm_name == wiki_name.strip().lower()
def neighborhood_components(self, latitude, longitude):
    """Query the neighborhoods index for the given point.

    Calls ``point_in_poly`` on ``self.neighborhoods_rtree`` with
    ``return_all=True`` and returns its result unchanged.
    """
    index = self.neighborhoods_rtree
    matches = index.point_in_poly(latitude, longitude, return_all=True)
    return matches
@@ -1370,7 +1398,8 @@ class AddressComponents(object):
return False
def expanded(self, address_components, latitude, longitude, language=None,
dropout_places=True, population=None, population_from_city=False,
dropout_places=True, population=None,
population_from_city=False, check_city_wikipedia=False,
add_sub_building_components=True, hyphenation=True,
num_floors=None, num_basements=None, zone=None,
osm_components=None):
@@ -1464,19 +1493,25 @@ class AddressComponents(object):
# Population of the city helps us determine if the city can be used
# on its own like "Seattle" or "New York" vs. smaller cities which
# have to be qualified with a state, country, etc.
if population is None and population_from_city:
tagged = self.categorized_osm_components(country, osm_components)
for props, component in (tagged or []):
if component == AddressFormatter.CITY and 'population' in props:
try:
population = int(props['population'])
except (ValueError, TypeError):
continue
unambiguous_city = False
if population is None and population_from_city:
population = 0
tagged = self.categorized_osm_components(country, osm_components)
for props, component in (tagged or []):
if component == AddressFormatter.CITY:
if self.unambiguous_wikipedia(component, language):
unambiguous_city = True
if 'population' in props:
try:
population = int(props['population'])
except (ValueError, TypeError):
continue
# Perform dropout on places
address_components = place_config.dropout_components(address_components, all_osm_components, country=country, population=population)
address_components = place_config.dropout_components(address_components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)
self.drop_invalid_components(address_components, country)

View File

@@ -427,13 +427,16 @@ class OpenAddressesFormatter(object):
# very small and the place name shouldn't be used unqualified (i.e. needs information
# like state name to disambiguate it)
population = 0
unambiguous_city = False
if add_osm_boundaries or AddressFormatter.CITY not in components:
osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
self.components.add_admin_boundaries(components, osm_components, country, language)
categorized = self.components.categorized_osm_components(country, osm_components)
for component, label in categorized:
if label == AddressFormatter.CITY and 'population' in component:
population = component['population']
if label == AddressFormatter.CITY:
unambiguous_city = self.components.unambiguous_wikipedia(component, language)
if 'population' in component:
population = component['population']
break
if AddressFormatter.CITY not in components and city_replacements:
@@ -460,7 +463,7 @@ class OpenAddressesFormatter(object):
# Component dropout
all_osm_components = osm_components + neighborhood_components
components = place_config.dropout_components(components, all_osm_components, country=country, population=population)
components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)
formatted = self.formatter.format_address(components, country, language=language,
minimal_only=False, tag_components=tag_components)

View File

@@ -920,7 +920,7 @@ class OSMAddressFormatter(object):
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language,
num_floors=num_floors, num_basements=num_basements,
zone=zone, add_sub_building_components=add_sub_building_components,
population_from_city=True, osm_components=osm_components)
population_from_city=True, check_city_wikipedia=True, osm_components=osm_components)
languages = list(country_languages[country])
venue_names = self.venue_names(tags, languages) or []

View File

@@ -121,8 +121,8 @@ class PlaceConfig(object):
return random.random() < probability
def include_component(self, component, containing_ids, country=None, population=None, check_population=True):
if check_population:
def include_component(self, component, containing_ids, country=None, population=None, check_population=True, unambiguous_city=False):
if check_population and not unambiguous_city:
population_exceptions = self.get_property(('components', component, 'population'), country=country, default=None)
if population_exceptions and self.include_by_population_exceptions(population_exceptions, population=population or 0):
return True
@@ -143,7 +143,7 @@ class PlaceConfig(object):
address_components.pop(c)
component_bitset ^= ComponentDependencies.component_bit_values[c]
def dropout_components(self, components, boundaries=(), country=None, population=None):
def dropout_components(self, components, boundaries=(), country=None, population=None, unambiguous_city=False):
containing_ids = set()
for boundary in boundaries:
@@ -172,7 +172,7 @@ class PlaceConfig(object):
city_replacements = set(self.get_property(('city_replacements', ), country=country))
for component in admin_components:
include = self.include_component(component, containing_ids, country=country, population=population)
include = self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city)
if not include and component not in city_replacements:
# Note: this check is for cities that have the same name as their admin
@@ -201,7 +201,7 @@ class PlaceConfig(object):
if values is not None:
value = weighted_choice(values, probs)
if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population):
if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city):
new_components[component] = value
self.drop_invalid_components(new_components, country, original_bitset=original_bitset)