[addresses] add strip_components method, which simply removes the names of OSM components from a string (e.g. for postal codes)

This commit is contained in:
Al
2017-02-10 23:57:20 -05:00
parent 4e1d7d9373
commit bbcb6444c8

View File

@@ -408,6 +408,39 @@ class AddressComponents(object):
return names, components return names, components
def strip_components(self, name, osm_components, country, languages):
    """Remove the names of the given OSM components from ``name``.

    The place names for ``osm_components`` are collected via
    ``place_names_and_components``; every token of ``name`` that is not part
    of one of those names is kept. The result is rebuilt from the normalized
    tokens (``NORMALIZE_STRING_LOWERCASE`` / ``TOKEN_OPTIONS_DROP_PERIODS``),
    so it is not guaranteed to preserve the casing or periods of the input.

    :param name: string to clean (e.g. a postal code value); returned
        unchanged when falsy.
    :param osm_components: OSM components whose names should be removed;
        when falsy, ``name`` is returned unchanged.
    :param country: forwarded to ``place_names_and_components``.
    :param languages: forwarded to ``place_names_and_components``.
    :return: the stripped string, with parenthesized text removed and
        truncated at the first comma.
    """
    if not name or not osm_components:
        return name

    tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
                                     token_options=TOKEN_OPTIONS_DROP_PERIODS)

    names, _ = self.place_names_and_components(name, osm_components, country=country, languages=languages)

    phrase_filter = PhraseFilter([(n, '') for n in names])

    # NOTE(review): assumes PhraseFilter.filter(tokens) yields
    # (is_phrase, token_or_tokens, value) triples, which is the shape the
    # loop below unpacks -- confirm against the PhraseFilter implementation.
    phrases = phrase_filter.filter(tokens_lower)

    stripped = []

    for is_phrase, token, _value in phrases:
        # Matched phrases are component names: drop them entirely.
        if is_phrase:
            continue

        t, c = token
        # Separate kept tokens with a space, except before ideographic
        # tokens, which are appended without a separator.
        if stripped and c not in (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER):
            stripped.append(u' ')
        stripped.append(t)

    name = u''.join(stripped)

    # Drop any parenthesized text.
    if self.parens_regex.search(name):
        name = self.parens_regex.sub(six.u(''), name).strip()

    # If the name contains a comma, stop and only use the phrase before the comma
    if ',' in name:
        return name.split(',', 1)[0].strip()

    return name
# Matches a parenthesized span, from "(" to the nearest following ")".
parens_regex = re.compile(r'\(.*?\)')
def normalized_place_name(self, name, tag, osm_components, country=None, languages=None, phrase_from_component=False): def normalized_place_name(self, name, tag, osm_components, country=None, languages=None, phrase_from_component=False):