[normalization] Adding default token options for numbers so we split alpha from numeric tokens and don't normalize digits

This commit is contained in:
Al
2016-04-28 13:03:16 -04:00
parent 3d765e9eca
commit 1fd4fbb7a2

View File

@@ -38,6 +38,8 @@ DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
NORMALIZE_TOKEN_REPLACE_DIGITS
DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS) | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
def remove_parens(tokens):
new_tokens = []