From 0ab3b13b75ce88c2b191fc7102ddbdaaf08a56c5 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Tue, 2 Aug 2016 16:25:39 -0400
Subject: [PATCH] [osm] Remove hanging commas, slashes, etc. Implementing a
 stricter rule for user-specified tags (not reverse geocoded) so that if they
 contain an unknown phrase followed by a containing boundary phrase, we delete
 that tag and fall back to the reverse geocoded components. Moving CLDR
 country tagging to later in the process since those are known correct names.

---
 scripts/geodata/addresses/components.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 804fa28d..929f2a58 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -355,16 +355,18 @@ class AddressComponents(object):
                 whitespace = not any((c in (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER) for t, c in phrase_tokens))
                 join_phrase = six.u(' ') if whitespace else six.u('')
 
-                if num_phrases > 0:
+                if num_phrases > 0 and total_tokens > 0:
+                    # Remove hanging comma, slash, etc.
+                    last_token, last_class = tokens[total_tokens - 1]
+                    if last_class in token_types.NON_ALPHANUMERIC_TOKEN_TYPES:
+                        total_tokens -= 1
                     # Return phrase with original capitalization
                     return join_phrase.join([t for t, c in tokens[:total_tokens]])
                 elif num_phrases == 0 and total_tokens > 0:
-                    phrase = join_phrase.join([t for t, c in phrase_tokens])
-                    if tag not in components.get(phrase, set()):
-                        return None
-                elif num_phrases == 0:
-                    current_phrase_tokens = tokens_lower[current_phrase_start:current_phrase_start + current_phrase_len]
-                    current_phrase = join_phrase.join([t for t, c in current_phrase_tokens])
+                    # We're only talking about addr:city tags, etc. so default to
+                    # the reverse geocoded components (better names) if we encounter
+                    # an unknown phrase followed by a containing boundary phrase.
+                    return None
 
                 current_phrase_start = total_tokens
                 current_phrase_len = len(phrase_tokens)
@@ -384,7 +386,9 @@ class AddressComponents(object):
 
         # If the name contains a comma, stop and only use the phrase before the comma
         if ',' in name:
-            return name.split(',')[0].strip()
+            return name.split(',', 1)[0].strip()
+        elif '/' in name:
+            return name.split('/', 1)[0].strip()
 
         return name
 
@@ -1171,8 +1175,6 @@ class AddressComponents(object):
             language = self.address_language(address_components, candidate_languages)
 
         non_local_language = self.non_local_language()
-        # If a country was already specified
-        self.replace_country_name(address_components, country, non_local_language or language)
 
         address_state = self.state_name(address_components, country, language, non_local_language=non_local_language)
         if address_state:
@@ -1188,6 +1190,9 @@ class AddressComponents(object):
 
         self.normalize_place_names(address_components, all_osm_components, country=country, languages=all_languages)
 
+        # If a country was already specified
+        self.replace_country_name(address_components, country, non_local_language or language)
+
         self.add_admin_boundaries(address_components, osm_components, country, language,
                                   non_local_language=non_local_language,
                                   language_suffix=language_suffix)