diff --git a/python/postal/text/normalize.py b/python/postal/text/normalize.py
index b9700433..0fbe0f4e 100644
--- a/python/postal/text/normalize.py
+++ b/python/postal/text/normalize.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from postal.text import _normalize
 from postal.text import _tokenize
+from postal.text.token_types import token_types
 from postal.text.encoding import safe_decode
 
 
@@ -40,4 +41,4 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
     # Tuples of (offset, len, type)
     tokens = _tokenize.tokenize(normalized)
 
-    return [_normalize.normalize_token(normalized, t, token_options) for t in tokens]
+    return [(_normalize.normalize_token(normalized, t, token_options), token_types.from_id(t[-1])) for t in tokens]
diff --git a/python/postal/text/token_types.py b/python/postal/text/token_types.py
index 4c8db65a..2fe433e5 100644
--- a/python/postal/text/token_types.py
+++ b/python/postal/text/token_types.py
@@ -52,11 +52,17 @@ class token_types(Enum):
     WHITESPACE = EnumValue(300)
     NEWLINE = EnumValue(301)
 
+    WORD_TOKEN_TYPES = set([
+        WORD,
+        ABBREVIATION,
+        IDEOGRAPHIC_CHAR,
+        HANGUL_SYLLABLE,
+        ACRONYM
+    ])
 
-word_token_types = set([
-    token_types.WORD,
-    token_types.ABBREVIATION,
-    token_types.IDEOGRAPHIC_CHAR,
-    token_types.HANGUL_SYLLABLE,
-    token_types.ACRONYM
-])
+    NUMERIC_TOKEN_TYPES = set([
+        NUMERIC,
+        ORDINAL,
+        ROMAN_NUMERAL,
+        IDEOGRAPHIC_NUMBER,
+    ])
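
Usage sketch of the new return format (not part of the diff): normalized_tokens() now yields (normalized_token, token_type) pairs rather than bare strings, and the class-level sets allow filtering by token class. The sample address string is illustrative, and the sketch assumes the changes above are applied and that WORD_TOKEN_TYPES / NUMERIC_TOKEN_TYPES stay accessible as plain attributes on token_types.

    from postal.text.normalize import normalized_tokens
    from postal.text.token_types import token_types

    # Each element is a (normalized_token, token_type) pair.
    tokens = normalized_tokens(u'120 E 96th St New York')

    # Filter by token class using the new class-level sets.
    words = [t for t, t_type in tokens if t_type in token_types.WORD_TOKEN_TYPES]
    numbers = [t for t, t_type in tokens if t_type in token_types.NUMERIC_TOKEN_TYPES]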