[normalization] normalized_tokens should not replace digits by default

2016-04-28 18:03:44 -04:00
parent 3a9ac9d96f
commit ee1aa564c4
2 changed files with 5 additions and 7 deletions
--- a/scripts/geodata/names/deduping.py
+++ b/scripts/geodata/names/deduping.py
@@ -39,8 +39,7 @@ class NameDeduper(object):

    @classmethod
    def tokenize(cls, s):
-        token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
-        return normalized_tokens(s, token_options=token_options)
+        return normalized_tokens(s)

    @classmethod
    def content_tokens(cls, s):
@@ -49,8 +48,8 @@ class NameDeduper(object):
            tokens = remove_parens(tokens)
        return [(cls.replacements.get(t, t), c)
                for t, c in tokens
-                if c in cls.content_categories
-                and t not in cls.stopwords]
+                if c in cls.content_categories and
+                t not in cls.stopwords]

    @classmethod
    def possible_match(cls, tokens1, tokens2):