[python/normalization] Adding Python bindings to the normalize module for use in OSM polygon matching

Al
2015-10-26 18:07:37 -04:00
parent a319c1f6a0
commit f6b6a17335
3 changed files with 379 additions and 9 deletions


@@ -0,0 +1,29 @@
from postal.text import _normalize
from postal.text import _tokenize
from postal.text.encoding import safe_decode

# String-level defaults: transliterate Latin script to ASCII, apply Unicode
# decomposition, trim surrounding whitespace, and lowercase.
DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII | \
                         _normalize.NORMALIZE_STRING_DECOMPOSE | \
                         _normalize.NORMALIZE_STRING_TRIM | \
                         _normalize.NORMALIZE_STRING_LOWERCASE

# Token-level defaults: replace hyphens, delete final/acronym periods,
# drop English possessives and other apostrophes, and replace digits.
DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
                        _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
                        _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
                        _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
                        _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
                        _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS


def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS):
    """Normalize s at the string level, tokenize, then normalize each token."""
    s = safe_decode(s)
    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        normalized = _normalize.normalize_string_latin(s, string_options)
    else:
        normalized = _normalize.normalize_string_utf8(s, string_options)

    # Tuples of (offset, len, type)
    tokens = _tokenize.tokenize(normalized)
    return [_normalize.normalize_token(normalized, t, token_options) for t in tokens]
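
For context, a minimal usage sketch of the new helper. The file path is not shown in the truncated diff header above, so the postal.text.normalize import path below is an assumption inferred from the package layout of the imports, and the per-token return value is whatever the _normalize.normalize_token C binding produces:

# Hypothetical usage sketch: assumes the module above landed at
# postal/text/normalize.py and that the _normalize and _tokenize
# C extensions are compiled and importable.
from postal.text.normalize import normalized_tokens

# With the defaults, the input is transliterated to Latin ASCII, lowercased,
# and trimmed at the string level, then tokenized and normalized per token
# (hyphen replacement, period/possessive handling, digit replacement).
for token in normalized_tokens("St. John's St."):
    print(token)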