[normalize] Adding hyphen elimination as a string option (changes tokenization)

This commit is contained in:
Al
2015-10-27 13:32:47 -04:00
parent 3fe2365234
commit 40918812e2
6 changed files with 36 additions and 4 deletions

View File

@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from postal.text import _normalize
from postal.text import _tokenize
@@ -6,6 +7,8 @@ from postal.text.encoding import safe_decode
# Default string-level normalization flags, OR-ed together into one bitmask.
# Used as the default ``string_options`` argument of ``normalized_tokens``.
DEFAULT_STRING_OPTIONS = (
    _normalize.NORMALIZE_STRING_LATIN_ASCII
    | _normalize.NORMALIZE_STRING_DECOMPOSE
    | _normalize.NORMALIZE_STRING_TRIM
    | _normalize.NORMALIZE_STRING_REPLACE_HYPHENS
    | _normalize.NORMALIZE_STRING_STRIP_ACCENTS
    | _normalize.NORMALIZE_STRING_LOWERCASE
)
DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
@@ -18,6 +21,17 @@ DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
token_options=DEFAULT_TOKEN_OPTIONS):
'''
Normalizes a string, tokenizes, and normalizes each token
with string and token-level options.
This version only uses libpostal's deterministic normalizations
i.e. methods with a single output. The string tree version will
return multiple normalized strings, each with tokens.
Usage:
normalized_tokens(u'St.-Barthélemy')
'''
s = safe_decode(s)
if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
normalized = _normalize.normalize_string_latin(s, string_options)