def strip_components(self, name, osm_components, country, languages):
    """Remove the names of OSM components from ``name``.

    Intended for values such as postal codes that may have place names
    mixed in. The result is rebuilt from the normalized tokens of ``name``
    (normalized with NORMALIZE_STRING_LOWERCASE and
    TOKEN_OPTIONS_DROP_PERIODS), so it is not guaranteed to preserve the
    original casing or punctuation.

    :param name: the string to strip; returned unchanged if falsy.
    :param osm_components: OSM components whose names should be removed;
        if falsy, ``name`` is returned unchanged.
    :param country: forwarded to ``place_names_and_components``.
    :param languages: forwarded to ``place_names_and_components``.
    :return: ``name`` with matched component names removed, any
        parenthesized text dropped, and truncated at the first comma.

    Relies on ``self.place_names_and_components`` and ``self.parens_regex``
    being defined on the enclosing class.
    """
    if not name or not osm_components:
        return name

    tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
                                     token_options=TOKEN_OPTIONS_DROP_PERIODS)

    # Only the names are needed here; the components half is ignored.
    names, _ = self.place_names_and_components(name, osm_components,
                                               country=country, languages=languages)

    phrase_filter = PhraseFilter([(n, '') for n in names])

    # The previous version looped over ``phrases`` without ever binding it,
    # which raised NameError for every non-trivial input. Run the filter
    # over the normalized tokens to produce the sequence.
    # NOTE(review): assumes PhraseFilter.filter(tokens) yields
    # (is_phrase, token_or_tokens, value) tuples - confirm against
    # geodata.text.phrases.
    phrases = phrase_filter.filter(tokens_lower)

    # Token classes that are appended without a separating space.
    unspaced_classes = (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER)

    stripped = []
    for is_phrase, token, _value in phrases:
        if is_phrase:
            # Matched a component name: leave it out of the output.
            continue
        t, c = token
        if stripped and c not in unspaced_classes:
            stripped.append(u' ')
        stripped.append(t)

    name = u''.join(stripped)

    if self.parens_regex.search(name):
        name = self.parens_regex.sub(u'', name).strip()

    # If the name contains a comma, stop and only use the phrase before the comma
    if ',' in name:
        return name.split(',', 1)[0].strip()

    return name