[addresses] add strip_components method, which simply removes the names of OSM components from a string (e.g. for postal codes)
This commit is contained in:
@@ -408,6 +408,39 @@ class AddressComponents(object):
|
|||||||
|
|
||||||
return names, components
|
return names, components
|
||||||
|
|
||||||
|
def strip_components(self, name, osm_components, country, languages):
    """
    Remove the names of the given OSM components from ``name``.

    Meant for values (e.g. postal codes) that may have place names mixed
    into them. The place names come from ``place_names_and_components``;
    tokens of ``name`` that fall inside a matched place-name phrase are
    dropped and the remaining tokens are re-joined.

    :param name: the string to strip; returned unchanged if falsy
    :param osm_components: OSM components whose names should be removed;
        if falsy, ``name`` is returned unchanged
    :param country: passed through to ``place_names_and_components``
    :param languages: passed through to ``place_names_and_components``
    :return: the stripped string, with any parenthesized spans removed and
        truncated at the first comma if one is present

    NOTE(review): the result is rebuilt from the output of
    ``normalized_tokens`` (requested with NORMALIZE_STRING_LOWERCASE), so
    it is presumably lowercased relative to the input -- confirm that is
    intended.
    """
    if not name or not osm_components:
        return name

    tokens_lower = normalized_tokens(name, string_options=NORMALIZE_STRING_LOWERCASE,
                                     token_options=TOKEN_OPTIONS_DROP_PERIODS)

    names, components = self.place_names_and_components(name, osm_components,
                                                        country=country, languages=languages)

    phrase_filter = PhraseFilter([(n, '') for n in names])

    # Previously ``phrases`` was never assigned, so the loop below raised
    # NameError for every non-empty input.
    # NOTE(review): assumes PhraseFilter.filter(tokens) yields
    # (is_phrase, tokens, value) triples, matching the unpacking below --
    # confirm against the PhraseFilter implementation.
    phrases = phrase_filter.filter(tokens_lower)

    stripped = []

    for is_phrase, token, value in phrases:
        # Tokens that are part of a matched place name are dropped.
        if is_phrase:
            continue

        t, c = token
        # Separate tokens with a space, except before ideographic tokens.
        if stripped and c not in (token_types.IDEOGRAPHIC_CHAR, token_types.IDEOGRAPHIC_NUMBER):
            stripped.append(u' ')
        stripped.append(t)

    name = u''.join(stripped)

    if self.parens_regex.search(name):
        name = self.parens_regex.sub(six.u(''), name).strip()

    # If the name contains a comma, stop and only use the phrase before the comma
    if ',' in name:
        return name.split(',', 1)[0].strip()

    return name
|
||||||
|
|
||||||
# Non-greedy match for one parenthesized span, e.g. "(abc)". Written as a raw
# string so the backslashes are not treated as (invalid) string escapes.
parens_regex = re.compile(r'\(.*?\)')
|
# Non-greedy match for one parenthesized span, e.g. "(abc)". Written as a raw
# string so the backslashes are not treated as (invalid) string escapes.
parens_regex = re.compile(r'\(.*?\)')
|
||||||
|
|
||||||
def normalized_place_name(self, name, tag, osm_components, country=None, languages=None, phrase_from_component=False):
|
def normalized_place_name(self, name, tag, osm_components, country=None, languages=None, phrase_from_component=False):
|
||||||
|
|||||||
Reference in New Issue
Block a user