From 97a2436ad710cde9d76ddd586d56959413623651 Mon Sep 17 00:00:00 2001
From: Al
Date: Tue, 2 Aug 2016 16:24:01 -0400
Subject: [PATCH] [tokenization] Adding two more sets to token_types for punctuation and non-alphanumerics

---
 scripts/geodata/text/token_types.py | 33 +++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/scripts/geodata/text/token_types.py b/scripts/geodata/text/token_types.py
index e56afab8..d94a734a 100644
--- a/scripts/geodata/text/token_types.py
+++ b/scripts/geodata/text/token_types.py
@@ -69,3 +69,36 @@ class token_types(Enum):
         ROMAN_NUMERAL,
         IDEOGRAPHIC_NUMBER,
     ])
+
+    PUNCTUATION_TOKEN_TYPES = set([
+        PERIOD,
+        EXCLAMATION,
+        QUESTION_MARK,
+        COMMA,
+        COLON,
+        SEMICOLON,
+        PLUS,
+        AMPERSAND,
+        AT_SIGN,
+        POUND,
+        ELLIPSIS,
+        DASH,
+        BREAKING_DASH,
+        HYPHEN,
+        PUNCT_OPEN,
+        PUNCT_CLOSE,
+        DOUBLE_QUOTE,
+        SINGLE_QUOTE,
+        OPEN_QUOTE,
+        CLOSE_QUOTE,
+        SLASH,
+        BACKSLASH,
+        GREATER_THAN,
+        LESS_THAN,
+    ])
+
+    NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES | set([
+        OTHER,
+        WHITESPACE,
+        NEWLINE,
+    ])