[boundaries] in addition to population, check if a city has an unambiguous Wikipedia
This commit is contained in:
@@ -909,6 +909,34 @@ class AddressComponents(object):
|
||||
|
||||
return city
|
||||
|
||||
# Matches a leading prefix of two or three lowercase ASCII letters followed by
# a colon (e.g. "en:") at the start of a string. unambiguous_wikipedia uses it
# to detect a prefixed wikipedia tag value before stripping the prefix.
generic_wiki_name_regex = re.compile('^[a-z]{2,3}:')
|
||||
|
||||
@classmethod
def unambiguous_wikipedia(cls, osm_component, language):
    """Tell whether a component's name equals its Wikipedia title.

    The comparison ignores case and surrounding whitespace. The Wikipedia
    title is read from the ``wikipedia:<language>`` key, falling back to
    the plain ``wikipedia`` key. A leading ``<language>:`` prefix (or any
    prefix matched by ``generic_wiki_name_regex``) is removed before the
    comparison.

    When no title is available and ``language`` is UNKNOWN_LANGUAGE or
    AMBIGUOUS_LANGUAGE, every ``wikipedia:*`` key is checked instead and a
    single match is enough.

    :param osm_component: mapping of tag keys to values for one component
        (must support ``.get`` and ``six.iteritems``).
    :param language: language code used to pick the tag and the prefix;
        may be UNKNOWN_LANGUAGE or AMBIGUOUS_LANGUAGE.
    :return: True if the name matches a Wikipedia title, False otherwise
        (including when the component has no ``name``).
    """
    name = osm_component.get('name')
    if not name:
        return False

    norm_name = safe_decode(name).strip().lower()
    language_known = language not in (UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE)

    wiki_name = osm_component.get('wikipedia:{}'.format(language)) or osm_component.get('wikipedia')

    if wiki_name:
        # Decode once, before any text operation: mixing an undecoded byte
        # string with the unicode prefix/separator below would otherwise
        # rely on implicit coercion (assumes safe_decode returns text, as
        # its use on the name above suggests).
        wiki_name = safe_decode(wiki_name)

        has_language_prefix = (
            language_known and
            wiki_name.lower().startswith(six.u('{}:'.format(language)))
        )
        if has_language_prefix or cls.generic_wiki_name_regex.match(wiki_name):
            # Keep only what follows the first colon, e.g. "en:Title" -> "Title".
            wiki_name = wiki_name.split(six.u(':'), 1)[-1]

    if not wiki_name:
        # Either no tag was present or stripping the prefix left nothing.
        if language_known:
            return False
        # Without a usable language, accept a match against any
        # language-specific wikipedia tag.
        return any(
            k.startswith('wikipedia:') and safe_decode(v).strip().lower() == norm_name
            for k, v in six.iteritems(osm_component)
        )

    return norm_name == wiki_name.strip().lower()
|
||||
|
||||
def neighborhood_components(self, latitude, longitude):
    """Query the neighborhoods index for the given point.

    Delegates to ``self.neighborhoods_rtree.point_in_poly`` with
    ``return_all=True`` and returns its result unchanged.
    """
    index = self.neighborhoods_rtree
    matches = index.point_in_poly(latitude, longitude, return_all=True)
    return matches
|
||||
|
||||
@@ -1370,7 +1398,8 @@ class AddressComponents(object):
|
||||
return False
|
||||
|
||||
def expanded(self, address_components, latitude, longitude, language=None,
|
||||
dropout_places=True, population=None, population_from_city=False,
|
||||
dropout_places=True, population=None,
|
||||
population_from_city=False, check_city_wikipedia=False,
|
||||
add_sub_building_components=True, hyphenation=True,
|
||||
num_floors=None, num_basements=None, zone=None,
|
||||
osm_components=None):
|
||||
@@ -1464,19 +1493,25 @@ class AddressComponents(object):
|
||||
# Population of the city helps us determine if the city can be used
|
||||
# on its own like "Seattle" or "New York" vs. smaller cities that
|
||||
# have to be qualified with a state, country, etc.
|
||||
if population is None and population_from_city:
|
||||
tagged = self.categorized_osm_components(country, osm_components)
|
||||
for props, component in (tagged or []):
|
||||
if component == AddressFormatter.CITY and 'population' in props:
|
||||
try:
|
||||
population = int(props['population'])
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
unambiguous_city = False
|
||||
|
||||
if population is None and population_from_city:
|
||||
population = 0
|
||||
tagged = self.categorized_osm_components(country, osm_components)
|
||||
|
||||
for props, component in (tagged or []):
|
||||
if component == AddressFormatter.CITY:
|
||||
if self.unambiguous_wikipedia(component, language):
|
||||
unambiguous_city = True
|
||||
|
||||
if 'population' in props:
|
||||
try:
|
||||
population = int(props['population'])
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
|
||||
# Perform dropout on places
|
||||
address_components = place_config.dropout_components(address_components, all_osm_components, country=country, population=population)
|
||||
address_components = place_config.dropout_components(address_components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)
|
||||
|
||||
self.drop_invalid_components(address_components, country)
|
||||
|
||||
|
||||
@@ -427,13 +427,16 @@ class OpenAddressesFormatter(object):
|
||||
# very small and the place name shouldn't be used unqualified (i.e. needs information
|
||||
# like state name to disambiguate it)
|
||||
population = 0
|
||||
unambiguous_city = False
|
||||
if add_osm_boundaries or AddressFormatter.CITY not in components:
|
||||
osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude)
|
||||
self.components.add_admin_boundaries(components, osm_components, country, language)
|
||||
categorized = self.components.categorized_osm_components(country, osm_components)
|
||||
for component, label in categorized:
|
||||
if label == AddressFormatter.CITY and 'population' in component:
|
||||
population = component['population']
|
||||
if label == AddressFormatter.CITY:
|
||||
unambiguous_city = self.components.unambiguous_wikipedia(component, language)
|
||||
if 'population' in component:
|
||||
population = component['population']
|
||||
break
|
||||
|
||||
if AddressFormatter.CITY not in components and city_replacements:
|
||||
@@ -460,7 +463,7 @@ class OpenAddressesFormatter(object):
|
||||
|
||||
# Component dropout
|
||||
all_osm_components = osm_components + neighborhood_components
|
||||
components = place_config.dropout_components(components, all_osm_components, country=country, population=population)
|
||||
components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city)
|
||||
|
||||
formatted = self.formatter.format_address(components, country, language=language,
|
||||
minimal_only=False, tag_components=tag_components)
|
||||
|
||||
@@ -920,7 +920,7 @@ class OSMAddressFormatter(object):
|
||||
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language,
|
||||
num_floors=num_floors, num_basements=num_basements,
|
||||
zone=zone, add_sub_building_components=add_sub_building_components,
|
||||
population_from_city=True, osm_components=osm_components)
|
||||
population_from_city=True, check_city_wikipedia=True, osm_components=osm_components)
|
||||
|
||||
languages = list(country_languages[country])
|
||||
venue_names = self.venue_names(tags, languages) or []
|
||||
|
||||
@@ -121,8 +121,8 @@ class PlaceConfig(object):
|
||||
|
||||
return random.random() < probability
|
||||
|
||||
def include_component(self, component, containing_ids, country=None, population=None, check_population=True):
|
||||
if check_population:
|
||||
def include_component(self, component, containing_ids, country=None, population=None, check_population=True, unambiguous_city=False):
|
||||
if check_population and not unambiguous_city:
|
||||
population_exceptions = self.get_property(('components', component, 'population'), country=country, default=None)
|
||||
if population_exceptions and self.include_by_population_exceptions(population_exceptions, population=population or 0):
|
||||
return True
|
||||
@@ -143,7 +143,7 @@ class PlaceConfig(object):
|
||||
address_components.pop(c)
|
||||
component_bitset ^= ComponentDependencies.component_bit_values[c]
|
||||
|
||||
def dropout_components(self, components, boundaries=(), country=None, population=None):
|
||||
def dropout_components(self, components, boundaries=(), country=None, population=None, unambiguous_city=False):
|
||||
containing_ids = set()
|
||||
|
||||
for boundary in boundaries:
|
||||
@@ -172,7 +172,7 @@ class PlaceConfig(object):
|
||||
city_replacements = set(self.get_property(('city_replacements', ), country=country))
|
||||
|
||||
for component in admin_components:
|
||||
include = self.include_component(component, containing_ids, country=country, population=population)
|
||||
include = self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city)
|
||||
|
||||
if not include and component not in city_replacements:
|
||||
# Note: this check is for cities that have the same name as their admin
|
||||
@@ -201,7 +201,7 @@ class PlaceConfig(object):
|
||||
if values is not None:
|
||||
value = weighted_choice(values, probs)
|
||||
|
||||
if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population):
|
||||
if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city):
|
||||
new_components[component] = value
|
||||
|
||||
self.drop_invalid_components(new_components, country, original_bitset=original_bitset)
|
||||
|
||||
Reference in New Issue
Block a user