[parser] Fixing config keys (street/venue => streets/venues) so OSM streets/venues actually get abbreviated. Selecting language-namespaced address fields (e.g. addr:street:<lang>) in cases like Brussels or Hong Kong where addresses are tagged bilingually. Adding the ability to pass a known language into address component expansion.

This commit is contained in:
Al
2016-05-26 12:05:46 -04:00
parent 206a471732
commit 5daa64faef
2 changed files with 35 additions and 26 deletions

View File

@@ -554,8 +554,8 @@ class AddressComponents(object):
States States
------ ------
Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name Primarily for the US, Canada and Australia, OSM addr:state tags tend to use the abbreviated
whereas we'd like to include both forms, so wtih some probability, replace the abbreviated state name whereas we'd like to include both forms. With some probability, replace the abbreviated
name with the unabbreviated one e.g. CA => California name with the unabbreviated one e.g. CA => California
''' '''
address_state = address_components.get(AddressFormatter.STATE) address_state = address_components.get(AddressFormatter.STATE)
@@ -1029,7 +1029,7 @@ class AddressComponents(object):
return True return True
return False return False
def expanded(self, address_components, latitude, longitude, def expanded(self, address_components, latitude, longitude, language=None,
dropout_places=True, add_sub_building_components=True, dropout_places=True, add_sub_building_components=True,
num_floors=None, num_basements=None, zone=None): num_floors=None, num_basements=None, zone=None):
''' '''
@@ -1053,11 +1053,10 @@ class AddressComponents(object):
if not (country and candidate_languages): if not (country and candidate_languages):
return None, None, None return None, None, None
language = None
more_than_one_official_language = len(candidate_languages) > 1 more_than_one_official_language = len(candidate_languages) > 1
language = self.address_language(address_components, candidate_languages) if not language:
language = self.address_language(address_components, candidate_languages)
non_local_language = self.non_local_language() non_local_language = self.non_local_language()
# If a country was already specified # If a country was already specified

View File

@@ -122,28 +122,22 @@ class OSMAddressFormatter(object):
self.config = yaml.load(open(OSM_PARSER_DATA_DEFAULT_CONFIG)) self.config = yaml.load(open(OSM_PARSER_DATA_DEFAULT_CONFIG))
self.formatter = AddressFormatter() self.formatter = AddressFormatter()
def pick_language(self, osm_tags, candidate_languages): def namespaced_language(self, tags, candidate_languages):
language = None language = None
pick_namespaced_language_prob = float(nested_get(self.config, ('languages', 'pick_namespaced_language_probability'), default=0.0)) pick_namespaced_language_prob = float(nested_get(self.config, ('languages', 'pick_namespaced_language_probability')))
if len(candidate_languages) == 1: if len(candidate_languages) > 1:
language = candidate_languages[0]['lang'] street = tags.get('addr:street', None)
else:
street = osm_tags.get('addr:street', None)
namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in osm_tags] namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in tags]
if street is not None and not namespaced: if namespaced and random.random() < pick_namespaced_language_prob:
language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
elif namespaced and random.random() < pick_namespaced_language_prob:
language = random.choice(namespaced) language = random.choice(namespaced)
lang_suffix = ':{}'.format(language) lang_suffix = ':{}'.format(language)
for k in osm_tags: for k in tags:
if k.startswith('addr:') and k.endswith(lang_suffix): if k.startswith('addr:') and k.endswith(lang_suffix):
osm_tags[k.rstrip(lang_suffix)] = osm_tags[k] tags[k.rstrip(lang_suffix)] = tags[k]
else:
language = UNKNOWN_LANGUAGE
return language return language
@@ -195,8 +189,8 @@ class OSMAddressFormatter(object):
to capturing some non-standard abbreviations/surface forms which may be to capturing some non-standard abbreviations/surface forms which may be
missing or sparse in OSM. missing or sparse in OSM.
''' '''
abbreviate_prob = float(nested_get(self.config, ('street', 'abbreviate_probability'), default=0.0)) abbreviate_prob = float(nested_get(self.config, ('streets', 'abbreviate_probability'), default=0.0))
separate_prob = float(nested_get(self.config, ('street', 'separate_probability'), default=0.0)) separate_prob = float(nested_get(self.config, ('streets', 'separate_probability'), default=0.0))
return abbreviate(street_and_synonyms_gazetteer, street, language, return abbreviate(street_and_synonyms_gazetteer, street, language,
abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
@@ -212,8 +206,8 @@ class OSMAddressFormatter(object):
to capturing some non-standard abbreviations/surface forms which may be to capturing some non-standard abbreviations/surface forms which may be
missing or sparse in OSM. missing or sparse in OSM.
''' '''
abbreviate_prob = float(nested_get(self.config, ('venue', 'abbreviate_probability'), default=0.0)) abbreviate_prob = float(nested_get(self.config, ('venues', 'abbreviate_probability'), default=0.0))
separate_prob = float(nested_get(self.config, ('venue', 'separate_probability'), default=0.0)) separate_prob = float(nested_get(self.config, ('venues', 'separate_probability'), default=0.0))
return abbreviate(names_gazetteer, name, language, return abbreviate(names_gazetteer, name, language,
abbreviate_prob=abbreviate_prob, separate_prob=separate_prob) abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
@@ -373,8 +367,18 @@ class OSMAddressFormatter(object):
except Exception: except Exception:
return None, None, None return None, None, None
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages):
return None, None, None
combined_street = self.combine_street_name(tags) combined_street = self.combine_street_name(tags)
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages):
return None, None, None
namespaced_language = self.namespaced_language(tags, candidate_languages)
revised_tags = self.normalize_address_components(tags) revised_tags = self.normalize_address_components(tags)
num_floors = None num_floors = None
@@ -390,7 +394,7 @@ class OSMAddressFormatter(object):
if subdivision_components: if subdivision_components:
zone = self.zone(subdivision_components) zone = self.zone(subdivision_components)
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=namespaced_language,
num_floors=num_floors, num_basements=num_basements, num_floors=num_floors, num_basements=num_basements,
zone=zone) zone=zone)
@@ -450,11 +454,17 @@ class OSMAddressFormatter(object):
except Exception: except Exception:
return None, None, None return None, None, None
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
if not (country and candidate_languages):
return None, None, None
namespaced_language = self.namespaced_language(tags, candidate_languages)
revised_tags = self.normalize_address_components(tags) revised_tags = self.normalize_address_components(tags)
admin_dropout_prob = float(nested_get(self.config, ('limited', 'admin_dropout_prob'), default=0.0)) admin_dropout_prob = float(nested_get(self.config, ('limited', 'admin_dropout_prob'), default=0.0))
address_components, country, language = self.components.limited(revised_tags, latitude, longitude) address_components, country, language = self.components.limited(revised_tags, latitude, longitude, language=namespaced_language)
if not address_components: if not address_components:
return None, None, None return None, None, None