From ee1aa564c40e164a08105ecc2160dc61784a5ade Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 28 Apr 2016 18:03:44 -0400 Subject: [PATCH] [normalization] normalize tokens should not replace digits by default --- scripts/geodata/names/deduping.py | 7 +++---- scripts/geodata/text/normalize.py | 5 ++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/scripts/geodata/names/deduping.py b/scripts/geodata/names/deduping.py index bda2d125..c2457b0e 100644 --- a/scripts/geodata/names/deduping.py +++ b/scripts/geodata/names/deduping.py @@ -39,8 +39,7 @@ class NameDeduper(object): @classmethod def tokenize(cls, s): - token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS - return normalized_tokens(s, token_options=token_options) + return normalized_tokens(s) @classmethod def content_tokens(cls, s): @@ -49,8 +48,8 @@ class NameDeduper(object): tokens = remove_parens(tokens) return [(cls.replacements.get(t, t), c) for t, c in tokens - if c in cls.content_categories - and t not in cls.stopwords] + if c in cls.content_categories and + t not in cls.stopwords] @classmethod def possible_match(cls, tokens1, tokens2): diff --git a/scripts/geodata/text/normalize.py b/scripts/geodata/text/normalize.py index 78448e6a..ca6cf10f 100644 --- a/scripts/geodata/text/normalize.py +++ b/scripts/geodata/text/normalize.py @@ -35,10 +35,9 @@ DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \ NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \ NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \ - NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \ - NORMALIZE_TOKEN_REPLACE_DIGITS + NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE -DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS) | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC +DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) def remove_parens(tokens):