[normalization] normalize tokens should not replace digits by default
This commit is contained in:
@@ -39,8 +39,7 @@ class NameDeduper(object):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tokenize(cls, s):
|
def tokenize(cls, s):
|
||||||
token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
|
return normalized_tokens(s)
|
||||||
return normalized_tokens(s, token_options=token_options)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def content_tokens(cls, s):
|
def content_tokens(cls, s):
|
||||||
@@ -49,8 +48,8 @@ class NameDeduper(object):
|
|||||||
tokens = remove_parens(tokens)
|
tokens = remove_parens(tokens)
|
||||||
return [(cls.replacements.get(t, t), c)
|
return [(cls.replacements.get(t, t), c)
|
||||||
for t, c in tokens
|
for t, c in tokens
|
||||||
if c in cls.content_categories
|
if c in cls.content_categories and
|
||||||
and t not in cls.stopwords]
|
t not in cls.stopwords]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def possible_match(cls, tokens1, tokens2):
|
def possible_match(cls, tokens1, tokens2):
|
||||||
|
|||||||
@@ -35,10 +35,9 @@ DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \
|
|||||||
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
|
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
|
||||||
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
|
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
|
||||||
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
|
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
|
||||||
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
|
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
|
||||||
NORMALIZE_TOKEN_REPLACE_DIGITS
|
|
||||||
|
|
||||||
DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS) | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
|
DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
|
||||||
|
|
||||||
|
|
||||||
def remove_parens(tokens):
|
def remove_parens(tokens):
|
||||||
|
|||||||
Reference in New Issue
Block a user