From cbeb08f1d16c978efb9197ea41324cf65589fb16 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 30 Oct 2015 12:34:07 -0400
Subject: [PATCH] [python/normalize] importing options from the C module

---
 python/postal/text/normalize.py | 43 ++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/python/postal/text/normalize.py b/python/postal/text/normalize.py
index f5d0608b..9993a5eb 100644
--- a/python/postal/text/normalize.py
+++ b/python/postal/text/normalize.py
@@ -5,19 +5,38 @@ from postal.text.token_types import token_types
 
 from postal.text.encoding import safe_decode
 
-DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_STRING_LATIN_ASCII | \
-    _normalize.NORMALIZE_STRING_DECOMPOSE | \
-    _normalize.NORMALIZE_STRING_TRIM | \
-    _normalize.NORMALIZE_STRING_REPLACE_HYPHENS | \
-    _normalize.NORMALIZE_STRING_STRIP_ACCENTS | \
-    _normalize.NORMALIZE_STRING_LOWERCASE
+# String options
+NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII
+NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE
+NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS
+NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE
+NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE
+NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM
+NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS
 
-DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS | \
-    _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
-    _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
-    _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
-    _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
-    _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS
+DEFAULT_STRING_OPTIONS = NORMALIZE_STRING_LATIN_ASCII | \
+    NORMALIZE_STRING_DECOMPOSE | \
+    NORMALIZE_STRING_TRIM | \
+    NORMALIZE_STRING_REPLACE_HYPHENS | \
+    NORMALIZE_STRING_STRIP_ACCENTS | \
+    NORMALIZE_STRING_LOWERCASE
+
+# Token options
+NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS
+NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS
+NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
+NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
+NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
+NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
+NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
+NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS
+
+DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \
+    NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
+    NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
+    NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
+    NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
+    NORMALIZE_TOKEN_REPLACE_DIGITS
 
 
 def remove_parens(tokens):