[normalization] adding a normalize_token function and some token options for deleting periods
This commit is contained in:
@@ -37,6 +37,9 @@ DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \
     NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
     NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
 
+TOKEN_OPTIONS_DROP_PERIODS = NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
+    NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
+
 DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
 
 
@@ -64,6 +67,10 @@ def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
     return normalized
 
 
+def normalize_token(s, t, token_options=DEFAULT_TOKEN_OPTIONS):
+    return _normalize.normalize_token(s, t, token_options)
+
+
 def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                       token_options=DEFAULT_TOKEN_OPTIONS,
                       strip_parentheticals=True):
Reference in New Issue
Block a user