[python] Making normalized_tokens return token classes as well, mimicking the tokenize API
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from postal.text import _normalize
|
from postal.text import _normalize
|
||||||
from postal.text import _tokenize
|
from postal.text import _tokenize
|
||||||
|
from postal.text.token_types import token_types
|
||||||
|
|
||||||
from postal.text.encoding import safe_decode
|
from postal.text.encoding import safe_decode
|
||||||
|
|
||||||
@@ -40,4 +41,4 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
|
|||||||
|
|
||||||
# Tuples of (offset, len, type)
|
# Tuples of (offset, len, type)
|
||||||
tokens = _tokenize.tokenize(normalized)
|
tokens = _tokenize.tokenize(normalized)
|
||||||
return [_normalize.normalize_token(normalized, t, token_options) for t in tokens]
|
return [(_normalize.normalize_token(normalized, t, token_options), token_types.from_id(t[-1])) for t in tokens]
|
||||||
|
|||||||
@@ -52,11 +52,17 @@ class token_types(Enum):
|
|||||||
WHITESPACE = EnumValue(300)
|
WHITESPACE = EnumValue(300)
|
||||||
NEWLINE = EnumValue(301)
|
NEWLINE = EnumValue(301)
|
||||||
|
|
||||||
|
WORD_TOKEN_TYPES = set([
|
||||||
|
WORD,
|
||||||
|
ABBREVIATION,
|
||||||
|
IDEOGRAPHIC_CHAR,
|
||||||
|
HANGUL_SYLLABLE,
|
||||||
|
ACRONYM
|
||||||
|
])
|
||||||
|
|
||||||
word_token_types = set([
|
NUMERIC_TOKEN_TYPES = set([
|
||||||
token_types.WORD,
|
NUMERIC,
|
||||||
token_types.ABBREVIATION,
|
ORDINAL,
|
||||||
token_types.IDEOGRAPHIC_CHAR,
|
ROMAN_NUMERAL,
|
||||||
token_types.HANGUL_SYLLABLE,
|
IDEOGRAPHIC_NUMBER,
|
||||||
token_types.ACRONYM
|
])
|
||||||
])
|
|
||||||
|
|||||||
Reference in New Issue
Block a user