[python/normalization] Adding Python bindings to the normalize module for use in OSM polygon matching
python/postal/text/normalize.py (new file, 29 lines added)
@@ -0,0 +1,29 @@
from postal.text import _normalize
from postal.text import _tokenize

from postal.text.encoding import safe_decode

DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII | \
                         _normalize.NORMALIZE_STRING_DECOMPOSE | \
                         _normalize.NORMALIZE_STRING_TRIM | \
                         _normalize.NORMALIZE_STRING_LOWERCASE

DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
                        _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
                        _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
                        _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
                        _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
                        _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS


def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS):
    s = safe_decode(s)
    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        normalized = _normalize.normalize_string_latin(s, string_options)
    else:
        normalized = _normalize.normalize_string_utf8(s, string_options)

    # Tuples of (offset, len, type)
    tokens = _tokenize.tokenize(normalized)
    return [_normalize.normalize_token(normalized, t, token_options) for t in tokens]