From 5daa64faef208e37f8eb4525bd1de938c30e21b3 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 26 May 2016 12:05:46 -0400 Subject: [PATCH] [parser] Fixing config keys so OSM streets/venues get abbreviated. Selecting namespaced address fields in cases like Brussels or Hong Kong where everything is bilingual. Adding the ability to pass a known language into address component expansion --- scripts/geodata/addresses/components.py | 11 +++--- scripts/geodata/osm/formatter.py | 50 +++++++++++++++---------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index e3629a0c..f03763e5 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -554,8 +554,8 @@ class AddressComponents(object): States ------ - Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name - whereas we'd like to include both forms, so wtih some probability, replace the abbreviated + Primarily for the US, Canada and Australia, OSM addr:state tags tend to use the abbreviated + state name whereas we'd like to include both forms. With some probability, replace the abbreviated name with the unabbreviated one e.g. CA => California ''' address_state = address_components.get(AddressFormatter.STATE) @@ -1029,7 +1029,7 @@ class AddressComponents(object): return True return False - def expanded(self, address_components, latitude, longitude, + def expanded(self, address_components, latitude, longitude, language=None, dropout_places=True, add_sub_building_components=True, num_floors=None, num_basements=None, zone=None): ''' @@ -1053,11 +1053,10 @@ class AddressComponents(object): if not (country and candidate_languages): return None, None, None - language = None - more_than_one_official_language = len(candidate_languages) > 1 - language = self.address_language(address_components, candidate_languages) + if not language: + language = self.address_language(address_components, candidate_languages) non_local_language = self.non_local_language() # If a country was already specified diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index 3f2d3281..6c34bbf5 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -122,28 +122,22 @@ class OSMAddressFormatter(object): self.config = yaml.load(open(OSM_PARSER_DATA_DEFAULT_CONFIG)) self.formatter = AddressFormatter() - def pick_language(self, osm_tags, candidate_languages): + def namespaced_language(self, tags, candidate_languages): language = None - pick_namespaced_language_prob = float(nested_get(self.config, ('languages', 'pick_namespaced_language_probability'), default=0.0)) + pick_namespaced_language_prob = float(nested_get(self.config, ('languages', 'pick_namespaced_language_probability'))) - if len(candidate_languages) == 1: - language = candidate_languages[0]['lang'] - else: - street = osm_tags.get('addr:street', None) + if len(candidate_languages) > 1: + street = tags.get('addr:street', None) - namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in osm_tags] + namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in tags] - if street is not None and not namespaced: - language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages]) - elif namespaced and random.random() < pick_namespaced_language_prob: + if namespaced and random.random() < pick_namespaced_language_prob: language = random.choice(namespaced) lang_suffix = ':{}'.format(language) - for k in osm_tags: + for k in tags: if k.startswith('addr:') and k.endswith(lang_suffix): - osm_tags[k.rstrip(lang_suffix)] = osm_tags[k] - else: - language = UNKNOWN_LANGUAGE + tags[k.rstrip(lang_suffix)] = tags[k] return language @@ -195,8 +189,8 @@ class OSMAddressFormatter(object): to capturing some non-standard abbreviations/surface forms which may be missing or sparse in OSM. ''' - abbreviate_prob = float(nested_get(self.config, ('street', 'abbreviate_probability'), default=0.0)) - separate_prob = float(nested_get(self.config, ('street', 'separate_probability'), default=0.0)) + abbreviate_prob = float(nested_get(self.config, ('streets', 'abbreviate_probability'), default=0.0)) + separate_prob = float(nested_get(self.config, ('streets', 'separate_probability'), default=0.0)) return abbreviate(street_and_synonyms_gazetteer, street, language, abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) @@ -212,8 +206,8 @@ class OSMAddressFormatter(object): to capturing some non-standard abbreviations/surface forms which may be missing or sparse in OSM. ''' - abbreviate_prob = float(nested_get(self.config, ('venue', 'abbreviate_probability'), default=0.0)) - separate_prob = float(nested_get(self.config, ('venue', 'separate_probability'), default=0.0)) + abbreviate_prob = float(nested_get(self.config, ('venues', 'abbreviate_probability'), default=0.0)) + separate_prob = float(nested_get(self.config, ('venues', 'separate_probability'), default=0.0)) return abbreviate(names_gazetteer, name, language, abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) @@ -373,8 +367,18 @@ class OSMAddressFormatter(object): except Exception: return None, None, None + country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + if not (country and candidate_languages): + return None, None, None + combined_street = self.combine_street_name(tags) + country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + if not (country and candidate_languages): + return None, None, None + + namespaced_language = self.namespaced_language(tags, candidate_languages) + revised_tags = self.normalize_address_components(tags) num_floors = None @@ -390,7 +394,7 @@ class OSMAddressFormatter(object): if subdivision_components: zone = self.zone(subdivision_components) - address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, + address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=namespaced_language, num_floors=num_floors, num_basements=num_basements, zone=zone) @@ -450,11 +454,17 @@ class OSMAddressFormatter(object): except Exception: return None, None, None + country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude) + if not (country and candidate_languages): + return None, None, None + + namespaced_language = self.namespaced_language(tags, candidate_languages) + revised_tags = self.normalize_address_components(tags) admin_dropout_prob = float(nested_get(self.config, ('limited', 'admin_dropout_prob'), default=0.0)) - address_components, country, language = self.components.limited(revised_tags, latitude, longitude) + address_components, country, language = self.components.limited(revised_tags, latitude, longitude, language=namespaced_language) if not address_components: return None, None, None