From cdbc102821476b89d2205a6212dcd67bbe270596 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 25 Nov 2016 13:29:59 -0800 Subject: [PATCH] [boundaries] in addition to population, check if a city has an unambiguous Wikipedia --- scripts/geodata/addresses/components.py | 55 ++++++++++++++++++---- scripts/geodata/openaddresses/formatter.py | 9 ++-- scripts/geodata/osm/formatter.py | 2 +- scripts/geodata/places/config.py | 10 ++-- 4 files changed, 57 insertions(+), 19 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index bcdfd521..2207f3ef 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -909,6 +909,34 @@ class AddressComponents(object): return city + generic_wiki_name_regex = re.compile('^[a-z]{2,3}:') + + @classmethod + def unambiguous_wikipedia(cls, osm_component, language): + name = osm_component.get('name') + if not name: + return False + + wiki_name = osm_component.get('wikipedia:{}'.format(language)) + if not wiki_name: + wiki_name = osm_component.get('wikipedia') + if wiki_name: + if (language not in (UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE) and wiki_name.lower().startswith(six.u('{}:'.format(language))) or cls.generic_wiki_name_regex.match(wiki_name)): + wiki_name = wiki_name.split(six.u(':'), 1)[-1] + + norm_name = safe_decode(name).strip().lower() + + if not wiki_name and language in (UNKNOWN_LANGUAGE, AMBIGUOUS_LANGUAGE): + for k, v in six.iteritems(osm_component): + if k.startswith('wikipedia:') and safe_decode(v).strip().lower() == norm_name: + return True + else: + return False + elif not wiki_name: + return False + + return norm_name == safe_decode(wiki_name).strip().lower() + def neighborhood_components(self, latitude, longitude): return self.neighborhoods_rtree.point_in_poly(latitude, longitude, return_all=True) @@ -1370,7 +1398,8 @@ class AddressComponents(object): return False def expanded(self, address_components, latitude, longitude, 
language=None, - dropout_places=True, population=None, population_from_city=False, + dropout_places=True, population=None, + population_from_city=False, check_city_wikipedia=False, add_sub_building_components=True, hyphenation=True, num_floors=None, num_basements=None, zone=None, osm_components=None): @@ -1464,19 +1493,25 @@ class AddressComponents(object): # Population of the city helps us determine if the city can be used # on its own like "Seattle" or "New York" vs. smaller cities like # have to be qualified with a state, country, etc. - if population is None and population_from_city: - tagged = self.categorized_osm_components(country, osm_components) - for props, component in (tagged or []): - if component == AddressFormatter.CITY and 'population' in props: - try: - population = int(props['population']) - except (ValueError, TypeError): - continue + unambiguous_city = False + if population is None and population_from_city: population = 0 + tagged = self.categorized_osm_components(country, osm_components) + + for props, component in (tagged or []): + if component == AddressFormatter.CITY: + if check_city_wikipedia and self.unambiguous_wikipedia(props, language): + unambiguous_city = True + + if 'population' in props: + try: + population = int(props['population']) + except (ValueError, TypeError): + continue # Perform dropout on places - address_components = place_config.dropout_components(address_components, all_osm_components, country=country, population=population) + address_components = place_config.dropout_components(address_components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city) self.drop_invalid_components(address_components, country) diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 7c423f3b..8e2f1d00 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -427,13 +427,16 @@ class 
OpenAddressesFormatter(object): # very 
small and the place name shouldn't be used unqualified (i.e. needs information # like state name to disambiguate it) population = 0 + unambiguous_city = False if add_osm_boundaries or AddressFormatter.CITY not in components: osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude) self.components.add_admin_boundaries(components, osm_components, country, language) categorized = self.components.categorized_osm_components(country, osm_components) for component, label in categorized: - if label == AddressFormatter.CITY and 'population' in component: - population = component['population'] + if label == AddressFormatter.CITY: + unambiguous_city = self.components.unambiguous_wikipedia(component, language) + if 'population' in component: + population = component['population'] break if AddressFormatter.CITY not in components and city_replacements: @@ -460,7 +463,7 @@ class OpenAddressesFormatter(object): # Component dropout all_osm_components = osm_components + neighborhood_components - components = place_config.dropout_components(components, all_osm_components, country=country, population=population) + components = place_config.dropout_components(components, all_osm_components, country=country, population=population, unambiguous_city=unambiguous_city) formatted = self.formatter.format_address(components, country, language=language, minimal_only=False, tag_components=tag_components) diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index 5c5d7b77..7fe1e29d 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -920,7 +920,7 @@ class OSMAddressFormatter(object): address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=language or namespaced_language, num_floors=num_floors, num_basements=num_basements, zone=zone, add_sub_building_components=add_sub_building_components, - population_from_city=True, osm_components=osm_components) + 
population_from_city=True, check_city_wikipedia=True, osm_components=osm_components) languages = list(country_languages[country]) venue_names = self.venue_names(tags, languages) or [] diff --git a/scripts/geodata/places/config.py b/scripts/geodata/places/config.py index 37e28b42..9d617e31 100644 --- a/scripts/geodata/places/config.py +++ b/scripts/geodata/places/config.py @@ -121,8 +121,8 @@ class PlaceConfig(object): return random.random() < probability - def include_component(self, component, containing_ids, country=None, population=None, check_population=True): - if check_population: + def include_component(self, component, containing_ids, country=None, population=None, check_population=True, unambiguous_city=False): + if check_population and not unambiguous_city: population_exceptions = self.get_property(('components', component, 'population'), country=country, default=None) if population_exceptions and self.include_by_population_exceptions(population_exceptions, population=population or 0): return True @@ -143,7 +143,7 @@ class PlaceConfig(object): address_components.pop(c) component_bitset ^= ComponentDependencies.component_bit_values[c] - def dropout_components(self, components, boundaries=(), country=None, population=None): + def dropout_components(self, components, boundaries=(), country=None, population=None, unambiguous_city=False): containing_ids = set() for boundary in boundaries: @@ -172,7 +172,7 @@ class PlaceConfig(object): city_replacements = set(self.get_property(('city_replacements', ), country=country)) for component in admin_components: - include = self.include_component(component, containing_ids, country=country, population=population) + include = self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city) if not include and component not in city_replacements: # Note: this check is for cities that have the same name as their admin @@ -201,7 +201,7 @@ class PlaceConfig(object): if 
values is not None: value = weighted_choice(values, probs) - if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population): + if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city): new_components[component] = value self.drop_invalid_components(new_components, country, original_bitset=original_bitset)