[parser] Fixing config keys so OSM streets/venues get abbreviated. Selecting language-namespaced address fields (e.g. addr:street:<lang>) in cases like Brussels or Hong Kong where everything is bilingual. Adding the ability to pass a known language into address component expansion.
This commit is contained in:
@@ -554,8 +554,8 @@ class AddressComponents(object):
|
|||||||
States
|
States
|
||||||
------
|
------
|
||||||
|
|
||||||
Primarily for the US, Canada and Australia, OSM tends to use the abbreviated state name
|
Primarily for the US, Canada and Australia, OSM addr:state tags tend to use the abbreviated
|
||||||
whereas we'd like to include both forms, so wtih some probability, replace the abbreviated
|
state name whereas we'd like to include both forms. With some probability, replace the abbreviated
|
||||||
name with the unabbreviated one e.g. CA => California
|
name with the unabbreviated one e.g. CA => California
|
||||||
'''
|
'''
|
||||||
address_state = address_components.get(AddressFormatter.STATE)
|
address_state = address_components.get(AddressFormatter.STATE)
|
||||||
@@ -1029,7 +1029,7 @@ class AddressComponents(object):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def expanded(self, address_components, latitude, longitude,
|
def expanded(self, address_components, latitude, longitude, language=None,
|
||||||
dropout_places=True, add_sub_building_components=True,
|
dropout_places=True, add_sub_building_components=True,
|
||||||
num_floors=None, num_basements=None, zone=None):
|
num_floors=None, num_basements=None, zone=None):
|
||||||
'''
|
'''
|
||||||
@@ -1053,11 +1053,10 @@ class AddressComponents(object):
|
|||||||
if not (country and candidate_languages):
|
if not (country and candidate_languages):
|
||||||
return None, None, None
|
return None, None, None
|
||||||
|
|
||||||
language = None
|
|
||||||
|
|
||||||
more_than_one_official_language = len(candidate_languages) > 1
|
more_than_one_official_language = len(candidate_languages) > 1
|
||||||
|
|
||||||
language = self.address_language(address_components, candidate_languages)
|
if not language:
|
||||||
|
language = self.address_language(address_components, candidate_languages)
|
||||||
|
|
||||||
non_local_language = self.non_local_language()
|
non_local_language = self.non_local_language()
|
||||||
# If a country was already specified
|
# If a country was already specified
|
||||||
|
|||||||
@@ -122,28 +122,22 @@ class OSMAddressFormatter(object):
|
|||||||
self.config = yaml.load(open(OSM_PARSER_DATA_DEFAULT_CONFIG))
|
self.config = yaml.load(open(OSM_PARSER_DATA_DEFAULT_CONFIG))
|
||||||
self.formatter = AddressFormatter()
|
self.formatter = AddressFormatter()
|
||||||
|
|
||||||
def pick_language(self, osm_tags, candidate_languages):
|
def namespaced_language(self, tags, candidate_languages):
|
||||||
language = None
|
language = None
|
||||||
|
|
||||||
pick_namespaced_language_prob = float(nested_get(self.config, ('languages', 'pick_namespaced_language_probability'), default=0.0))
|
pick_namespaced_language_prob = float(nested_get(self.config, ('languages', 'pick_namespaced_language_probability')))
|
||||||
|
|
||||||
if len(candidate_languages) == 1:
|
if len(candidate_languages) > 1:
|
||||||
language = candidate_languages[0]['lang']
|
street = tags.get('addr:street', None)
|
||||||
else:
|
|
||||||
street = osm_tags.get('addr:street', None)
|
|
||||||
|
|
||||||
namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in osm_tags]
|
namespaced = [l['lang'] for l in candidate_languages if 'addr:street:{}'.format(l['lang']) in tags]
|
||||||
|
|
||||||
if street is not None and not namespaced:
|
if namespaced and random.random() < pick_namespaced_language_prob:
|
||||||
language = disambiguate_language(street, [(l['lang'], l['default']) for l in candidate_languages])
|
|
||||||
elif namespaced and random.random() < pick_namespaced_language_prob:
|
|
||||||
language = random.choice(namespaced)
|
language = random.choice(namespaced)
|
||||||
lang_suffix = ':{}'.format(language)
|
lang_suffix = ':{}'.format(language)
|
||||||
for k in osm_tags:
|
for k in tags:
|
||||||
if k.startswith('addr:') and k.endswith(lang_suffix):
|
if k.startswith('addr:') and k.endswith(lang_suffix):
|
||||||
osm_tags[k.rstrip(lang_suffix)] = osm_tags[k]
|
tags[k.rstrip(lang_suffix)] = tags[k]
|
||||||
else:
|
|
||||||
language = UNKNOWN_LANGUAGE
|
|
||||||
|
|
||||||
return language
|
return language
|
||||||
|
|
||||||
@@ -195,8 +189,8 @@ class OSMAddressFormatter(object):
|
|||||||
to capturing some non-standard abbreviations/surface forms which may be
|
to capturing some non-standard abbreviations/surface forms which may be
|
||||||
missing or sparse in OSM.
|
missing or sparse in OSM.
|
||||||
'''
|
'''
|
||||||
abbreviate_prob = float(nested_get(self.config, ('street', 'abbreviate_probability'), default=0.0))
|
abbreviate_prob = float(nested_get(self.config, ('streets', 'abbreviate_probability'), default=0.0))
|
||||||
separate_prob = float(nested_get(self.config, ('street', 'separate_probability'), default=0.0))
|
separate_prob = float(nested_get(self.config, ('streets', 'separate_probability'), default=0.0))
|
||||||
|
|
||||||
return abbreviate(street_and_synonyms_gazetteer, street, language,
|
return abbreviate(street_and_synonyms_gazetteer, street, language,
|
||||||
abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
|
abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
|
||||||
@@ -212,8 +206,8 @@ class OSMAddressFormatter(object):
|
|||||||
to capturing some non-standard abbreviations/surface forms which may be
|
to capturing some non-standard abbreviations/surface forms which may be
|
||||||
missing or sparse in OSM.
|
missing or sparse in OSM.
|
||||||
'''
|
'''
|
||||||
abbreviate_prob = float(nested_get(self.config, ('venue', 'abbreviate_probability'), default=0.0))
|
abbreviate_prob = float(nested_get(self.config, ('venues', 'abbreviate_probability'), default=0.0))
|
||||||
separate_prob = float(nested_get(self.config, ('venue', 'separate_probability'), default=0.0))
|
separate_prob = float(nested_get(self.config, ('venues', 'separate_probability'), default=0.0))
|
||||||
|
|
||||||
return abbreviate(names_gazetteer, name, language,
|
return abbreviate(names_gazetteer, name, language,
|
||||||
abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
|
abbreviate_prob=abbreviate_prob, separate_prob=separate_prob)
|
||||||
@@ -373,8 +367,18 @@ class OSMAddressFormatter(object):
|
|||||||
except Exception:
|
except Exception:
|
||||||
return None, None, None
|
return None, None, None
|
||||||
|
|
||||||
|
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
|
||||||
|
if not (country and candidate_languages):
|
||||||
|
return None, None, None
|
||||||
|
|
||||||
combined_street = self.combine_street_name(tags)
|
combined_street = self.combine_street_name(tags)
|
||||||
|
|
||||||
|
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
|
||||||
|
if not (country and candidate_languages):
|
||||||
|
return None, None, None
|
||||||
|
|
||||||
|
namespaced_language = self.namespaced_language(tags, candidate_languages)
|
||||||
|
|
||||||
revised_tags = self.normalize_address_components(tags)
|
revised_tags = self.normalize_address_components(tags)
|
||||||
|
|
||||||
num_floors = None
|
num_floors = None
|
||||||
@@ -390,7 +394,7 @@ class OSMAddressFormatter(object):
|
|||||||
if subdivision_components:
|
if subdivision_components:
|
||||||
zone = self.zone(subdivision_components)
|
zone = self.zone(subdivision_components)
|
||||||
|
|
||||||
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude,
|
address_components, country, language = self.components.expanded(revised_tags, latitude, longitude, language=namespaced_language,
|
||||||
num_floors=num_floors, num_basements=num_basements,
|
num_floors=num_floors, num_basements=num_basements,
|
||||||
zone=zone)
|
zone=zone)
|
||||||
|
|
||||||
@@ -450,11 +454,17 @@ class OSMAddressFormatter(object):
|
|||||||
except Exception:
|
except Exception:
|
||||||
return None, None, None
|
return None, None, None
|
||||||
|
|
||||||
|
country, candidate_languages, language_props = self.language_rtree.country_and_languages(latitude, longitude)
|
||||||
|
if not (country and candidate_languages):
|
||||||
|
return None, None, None
|
||||||
|
|
||||||
|
namespaced_language = self.namespaced_language(tags, candidate_languages)
|
||||||
|
|
||||||
revised_tags = self.normalize_address_components(tags)
|
revised_tags = self.normalize_address_components(tags)
|
||||||
|
|
||||||
admin_dropout_prob = float(nested_get(self.config, ('limited', 'admin_dropout_prob'), default=0.0))
|
admin_dropout_prob = float(nested_get(self.config, ('limited', 'admin_dropout_prob'), default=0.0))
|
||||||
|
|
||||||
address_components, country, language = self.components.limited(revised_tags, latitude, longitude)
|
address_components, country, language = self.components.limited(revised_tags, latitude, longitude, language=namespaced_language)
|
||||||
|
|
||||||
if not address_components:
|
if not address_components:
|
||||||
return None, None, None
|
return None, None, None
|
||||||
|
|||||||
Reference in New Issue
Block a user