[addresses] add strip_components method, which simply removes the names of OSM components from a string (e.g. for postal codes)

2017-02-10 23:57:20 -05:00
parent 4e1d7d9373
commit bbcb6444c8
1 changed files with 33 additions and 0 deletions
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -408,6 +408,39 @@ class AddressComponents(object):

        return names, components

+    def strip_components(self, name, osm_components, country, languages):
+        """Return name with the names of the given OSM components removed.
+
+        Tokens matched as component-name phrases are dropped and the rest are
+        re-joined; parenthesized text is then removed and only the part before
+        the first comma is kept. A falsy name or osm_components returns name as is.
+        """
+        if not name or not osm_components:
+            return name
+
+        tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
+                                         token_options=TOKEN_OPTIONS_DROP_PERIODS)
+        names, components = self.place_names_and_components(name, osm_components, country=country, languages=languages)
+        phrase_filter = PhraseFilter([(n, '') for n in names])
+
+        stripped = []
+        # NOTE(review): assumes PhraseFilter.filter() yields (is_phrase, tokens, value)
+        # triples, matching the unpacking below - confirm against its definition.
+        for is_phrase, token, value in phrase_filter.filter(tokens_lower):
+            if not is_phrase:
+                t, c = token
+                # No separating space is inserted before ideographic tokens
+                if stripped and c not in (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER):
+                    stripped.append(u' ')
+                stripped.append(t)
+
+        name = u''.join(stripped)
+        if self.parens_regex.search(name):
+            name = self.parens_regex.sub(six.u(''), name).strip()
+
+        # If the name contains a comma, stop and only use the phrase before the comma
+        return name.split(',', 1)[0].strip() if ',' in name else name
+
    parens_regex = re.compile('\(.*?\)')

    def normalized_place_name(self, name, tag, osm_components, country=None, languages=None, phrase_from_component=False):