[osm] Remove hanging commas, slashes, etc. Implement a stricter rule for user-specified tags (not reverse geocoded): if a tag contains an unknown phrase followed by a containing boundary phrase, we delete that tag and fall back to the reverse geocoded components. Move CLDR country tagging to later in the process, since those are known correct names.
This commit is contained in:
@@ -355,16 +355,18 @@ class AddressComponents(object):
|
|||||||
whitespace = not any((c in (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER) for t, c in phrase_tokens))
|
whitespace = not any((c in (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER) for t, c in phrase_tokens))
|
||||||
join_phrase = six.u(' ') if whitespace else six.u('')
|
join_phrase = six.u(' ') if whitespace else six.u('')
|
||||||
|
|
||||||
if num_phrases > 0:
|
if num_phrases > 0 and total_tokens > 0:
|
||||||
|
# Remove hanging comma, slash, etc.
|
||||||
|
last_token, last_class = tokens[total_tokens - 1]
|
||||||
|
if last_class in token_types.NON_ALPHANUMERIC_TOKEN_TYPES:
|
||||||
|
total_tokens -= 1
|
||||||
# Return phrase with original capitalization
|
# Return phrase with original capitalization
|
||||||
return join_phrase.join([t for t, c in tokens[:total_tokens]])
|
return join_phrase.join([t for t, c in tokens[:total_tokens]])
|
||||||
elif num_phrases == 0 and total_tokens > 0:
|
elif num_phrases == 0 and total_tokens > 0:
|
||||||
phrase = join_phrase.join([t for t, c in phrase_tokens])
|
# We're only talking about addr:city tags, etc. so default to
|
||||||
if tag not in components.get(phrase, set()):
|
# the reverse geocoded components (better names) if we encounter
|
||||||
return None
|
# an unknown phrase followed by a containing boundary phrase.
|
||||||
elif num_phrases == 0:
|
return None
|
||||||
current_phrase_tokens = tokens_lower[current_phrase_start:current_phrase_start + current_phrase_len]
|
|
||||||
current_phrase = join_phrase.join([t for t, c in current_phrase_tokens])
|
|
||||||
|
|
||||||
current_phrase_start = total_tokens
|
current_phrase_start = total_tokens
|
||||||
current_phrase_len = len(phrase_tokens)
|
current_phrase_len = len(phrase_tokens)
|
||||||
@@ -384,7 +386,9 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
# If the name contains a comma, stop and only use the phrase before the comma
|
# If the name contains a comma, stop and only use the phrase before the comma
|
||||||
if ',' in name:
|
if ',' in name:
|
||||||
return name.split(',')[0].strip()
|
return name.split(',', 1)[0].strip()
|
||||||
|
elif '/' in name:
|
||||||
|
return name.split('/', 1)[0].strip()
|
||||||
|
|
||||||
return name
|
return name
|
||||||
|
|
||||||
@@ -1171,8 +1175,6 @@ class AddressComponents(object):
|
|||||||
language = self.address_language(address_components, candidate_languages)
|
language = self.address_language(address_components, candidate_languages)
|
||||||
|
|
||||||
non_local_language = self.non_local_language()
|
non_local_language = self.non_local_language()
|
||||||
# If a country was already specified
|
|
||||||
self.replace_country_name(address_components, country, non_local_language or language)
|
|
||||||
|
|
||||||
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
|
address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
|
||||||
if address_state:
|
if address_state:
|
||||||
@@ -1188,6 +1190,9 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
|
||||||
|
|
||||||
|
# If a country was already specified
|
||||||
|
self.replace_country_name(address_components, country, non_local_language or language)
|
||||||
|
|
||||||
self.add_admin_boundaries(address_components, osm_components, country, language,
|
self.add_admin_boundaries(address_components, osm_components, country, language,
|
||||||
non_local_language=non_local_language,
|
non_local_language=non_local_language,
|
||||||
language_suffix=language_suffix)
|
language_suffix=language_suffix)
|
||||||
|
|||||||
Reference in New Issue
Block a user