From f69e63e311b596f4e6751236b9e7dd2a7f475b38 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 28 Aug 2016 13:59:28 -0400 Subject: [PATCH] [openaddresses] Place component dropout. Obtain population from OSM components when we have them, but otherwise assume it's actually 0 (not unknown). That way the more conservative probabilities will be used, i.e. state names will be included more often rather than unqualified cities. --- scripts/geodata/openaddresses/formatter.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 76e873b5..a4e05173 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -18,6 +18,7 @@ from geodata.countries.names import country_names from geodata.encoding import safe_decode, safe_encode from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE from geodata.math.sampling import cdf, weighted_choice +from geodata.places.config import place_config from geodata.text.utils import is_numeric, is_numeric_strict from geodata.csv_utils import tsv_string, unicode_csv_reader @@ -365,15 +366,27 @@ class OpenAddressesFormatter(object): # This is expensive, so only turn on for files that don't supply their own city names # or for which those names are flawed + osm_components = [] + population = None if add_osm_boundaries or AddressFormatter.CITY not in components: osm_components = self.components.osm_reverse_geocoded_components(latitude, longitude) self.components.add_admin_boundaries(components, osm_components, country, language) + categorized = self.components.categorized_osm_components(country, osm_components) + for component, label in categorized: + if label == AddressFormatter.CITY and 'population' in component: + population = component['population'] + break # The neighborhood index is cheaper so can turn on for whole countries + neighborhood_components = [] if add_osm_neighborhoods: 
neighborhood_components = self.components.neighborhood_components(latitude, longitude) self.components.add_neighborhoods(components, neighborhood_components) + if add_osm_boundaries or add_osm_neighborhoods: + all_osm_components = osm_components + neighborhood_components + components = place_config.dropout_components(components, all_osm_components, country=country, population=population) + formatted = self.formatter.format_address(components, country, language=language, tag_components=tag_components) yield (language, country, formatted)