diff --git a/scripts/geodata/names/deduping.py b/scripts/geodata/names/deduping.py
index bda2d125..c2457b0e 100644
--- a/scripts/geodata/names/deduping.py
+++ b/scripts/geodata/names/deduping.py
@@ -39,8 +39,7 @@ class NameDeduper(object):
 
     @classmethod
     def tokenize(cls, s):
-        token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
-        return normalized_tokens(s, token_options=token_options)
+        return normalized_tokens(s)
 
     @classmethod
     def content_tokens(cls, s):
@@ -49,8 +48,8 @@ class NameDeduper(object):
         tokens = remove_parens(tokens)
 
         return [(cls.replacements.get(t, t), c) for t, c in tokens
-                if c in cls.content_categories
-                and t not in cls.stopwords]
+                if c in cls.content_categories and
+                t not in cls.stopwords]
 
     @classmethod
     def possible_match(cls, tokens1, tokens2):
diff --git a/scripts/geodata/text/normalize.py b/scripts/geodata/text/normalize.py
index 78448e6a..ca6cf10f 100644
--- a/scripts/geodata/text/normalize.py
+++ b/scripts/geodata/text/normalize.py
@@ -35,10 +35,9 @@ DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \
                         NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
                         NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
                         NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
-                        NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
-                        NORMALIZE_TOKEN_REPLACE_DIGITS
+                        NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
 
-DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS) | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
+DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)
 
 
 def remove_parens(tokens):
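
Note on the normalize.py hunk: previously DEFAULT_TOKEN_OPTIONS included NORMALIZE_TOKEN_REPLACE_DIGITS, so DEFAULT_TOKEN_OPTIONS_NUMERIC used XOR to clear that bit before adding NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC. With the flag removed from the default set, the same XOR would toggle the bit back on, so a plain OR is sufficient. A minimal sketch of the bit arithmetic, using made-up flag values rather than libpostal's actual constants:

# Sketch of the flag arithmetic behind this change; the values below are
# hypothetical placeholders, not the constants defined by the tokenizer.
NORMALIZE_TOKEN_REPLACE_HYPHENS = 1 << 0
NORMALIZE_TOKEN_REPLACE_DIGITS = 1 << 1
NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = 1 << 2

# Old default set included REPLACE_DIGITS, so XOR cleared that bit.
old_default = NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_DIGITS
assert (old_default ^ NORMALIZE_TOKEN_REPLACE_DIGITS) == NORMALIZE_TOKEN_REPLACE_HYPHENS

# New default set omits REPLACE_DIGITS; the same XOR would now set the bit
# instead of clearing it.
new_default = NORMALIZE_TOKEN_REPLACE_HYPHENS
assert (new_default ^ NORMALIZE_TOKEN_REPLACE_DIGITS) == old_default

# Hence the numeric variant now just ORs in the extra flag and still ends up
# without digit replacement.
DEFAULT_TOKEN_OPTIONS_NUMERIC = new_default | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
assert DEFAULT_TOKEN_OPTIONS_NUMERIC & NORMALIZE_TOKEN_REPLACE_DIGITS == 0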