From bbcb6444c80861c823efb636d29347fb1feb82a9 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Fri, 10 Feb 2017 23:57:20 -0500
Subject: [PATCH] [addresses] add strip_components method which simply removes
 the names of OSM components from a string (e.g. for postal codes)

---
 scripts/geodata/addresses/components.py | 33 +++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py
index 6e63d4e9..2d8263c5 100644
--- a/scripts/geodata/addresses/components.py
+++ b/scripts/geodata/addresses/components.py
@@ -408,6 +408,39 @@ class AddressComponents(object):
 
         return names, components
 
+    def strip_components(self, name, osm_components, country, languages):
+        """Return name with the names of its OSM components removed.
+
+        Rebuilt from the lowercase-normalized tokens that match no component
+        name; parentheticals and anything after the first comma are dropped.
+        """
+        if not name or not osm_components:
+            return name
+
+        tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
+                                         token_options=TOKEN_OPTIONS_DROP_PERIODS)
+        names, _ = self.place_names_and_components(name, osm_components, country=country, languages=languages)
+        phrase_filter = PhraseFilter([(n, '') for n in names])
+
+        stripped = []
+        # Phrases matched by the filter are component names: keep only the rest
+        for is_phrase, token, value in phrase_filter.filter(tokens_lower):
+            if not is_phrase:
+                t, c = token
+                # No separating space is inserted before ideographic tokens
+                if stripped and c not in (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER):
+                    stripped.append(u' ')
+                stripped.append(t)
+        name = u''.join(stripped)
+
+        if self.parens_regex.search(name):
+            name = self.parens_regex.sub(six.u(''), name).strip()
+
+        # If the name contains a comma, stop and only use the phrase before the comma
+        if ',' in name:
+            return name.split(',', 1)[0].strip()
+        return name
+
     parens_regex = re.compile('\(.*?\)')
 
     def normalized_place_name(self, name, tag, osm_components, country=None, languages=None, phrase_from_component=False):