[tokenization] Adding two more sets to token_types for punctuation and non-alphanumerics
This commit is contained in:
@@ -69,3 +69,36 @@ class token_types(Enum):
|
|||||||
ROMAN_NUMERAL,
|
ROMAN_NUMERAL,
|
||||||
IDEOGRAPHIC_NUMBER,
|
IDEOGRAPHIC_NUMBER,
|
||||||
])
|
])
|
||||||
|
|
||||||
|
# Set of the token types that are grouped together as punctuation.
# Members are listed in the same order as the original definition.
PUNCTUATION_TOKEN_TYPES = {
    PERIOD, EXCLAMATION, QUESTION_MARK,
    COMMA, COLON, SEMICOLON,
    PLUS, AMPERSAND, AT_SIGN, POUND,
    ELLIPSIS, DASH, BREAKING_DASH, HYPHEN,
    PUNCT_OPEN, PUNCT_CLOSE,
    DOUBLE_QUOTE, SINGLE_QUOTE, OPEN_QUOTE, CLOSE_QUOTE,
    SLASH, BACKSLASH,
    GREATER_THAN, LESS_THAN,
}
|
||||||
|
|
||||||
|
# Every member of PUNCTUATION_TOKEN_TYPES plus OTHER, WHITESPACE and NEWLINE.
# union() returns a new set, so PUNCTUATION_TOKEN_TYPES itself is not modified.
NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES.union(
    (OTHER, WHITESPACE, NEWLINE)
)
|
||||||
|
|||||||
Reference in New Issue
Block a user