[tokenization] Adding two more sets to token_types for punctuation and non-alphanumerics

This commit is contained in:
Al
2016-08-02 16:24:01 -04:00
parent c40ad99ec7
commit 97a2436ad7

View File

@@ -69,3 +69,36 @@ class token_types(Enum):
ROMAN_NUMERAL,
IDEOGRAPHIC_NUMBER,
])
# Set of the token types that are classified as punctuation.
PUNCTUATION_TOKEN_TYPES = {
    PERIOD, EXCLAMATION, QUESTION_MARK,
    COMMA, COLON, SEMICOLON,
    PLUS, AMPERSAND, AT_SIGN, POUND,
    ELLIPSIS, DASH, BREAKING_DASH, HYPHEN,
    PUNCT_OPEN, PUNCT_CLOSE,
    DOUBLE_QUOTE, SINGLE_QUOTE, OPEN_QUOTE, CLOSE_QUOTE,
    SLASH, BACKSLASH,
    GREATER_THAN, LESS_THAN,
}
# Every punctuation token type, plus OTHER, WHITESPACE and NEWLINE.
# union() builds a new set, so PUNCTUATION_TOKEN_TYPES itself is not modified.
NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES.union(
    (OTHER, WHITESPACE, NEWLINE)
)