[normalization] token normalization should not replace digits by default

This commit is contained in:
Al
2016-04-28 18:03:44 -04:00
parent 1c6844f8f3
commit 6a8b59fc90
2 changed files with 5 additions and 7 deletions

View File

@@ -39,8 +39,7 @@ class NameDeduper(object):
@classmethod @classmethod
def tokenize(cls, s): def tokenize(cls, s):
token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS return normalized_tokens(s)
return normalized_tokens(s, token_options=token_options)
@classmethod @classmethod
def content_tokens(cls, s): def content_tokens(cls, s):
@@ -49,8 +48,8 @@ class NameDeduper(object):
tokens = remove_parens(tokens) tokens = remove_parens(tokens)
return [(cls.replacements.get(t, t), c) return [(cls.replacements.get(t, t), c)
for t, c in tokens for t, c in tokens
if c in cls.content_categories if c in cls.content_categories and
and t not in cls.stopwords] t not in cls.stopwords]
@classmethod @classmethod
def possible_match(cls, tokens1, tokens2): def possible_match(cls, tokens1, tokens2):

View File

@@ -35,10 +35,9 @@ DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \ NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \ NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \ NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
NORMALIZE_TOKEN_REPLACE_DIGITS
DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS) | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
def remove_parens(tokens): def remove_parens(tokens):