From 1fd4fbb7a27a56098d00751ada58d8891cd7a9a4 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 28 Apr 2016 13:03:16 -0400 Subject: [PATCH] [normalization] Adding default token options for numbers so we split alpha from numeric tokens and don't normalize digits --- scripts/geodata/text/normalize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/geodata/text/normalize.py b/scripts/geodata/text/normalize.py index 253425a8..78448e6a 100644 --- a/scripts/geodata/text/normalize.py +++ b/scripts/geodata/text/normalize.py @@ -38,6 +38,8 @@ DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \ NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \ NORMALIZE_TOKEN_REPLACE_DIGITS +DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS) | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC + def remove_parens(tokens): new_tokens = []