[python] Make normalized_tokens return token classes as well, mimicking the tokenize API
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from postal.text import _normalize
|
||||
from postal.text import _tokenize
|
||||
from postal.text.token_types import token_types
|
||||
|
||||
from postal.text.encoding import safe_decode
|
||||
|
||||
@@ -40,4 +41,4 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
|
||||
|
||||
# Tuples of (offset, len, type)
|
||||
tokens = _tokenize.tokenize(normalized)
|
||||
return [_normalize.normalize_token(normalized, t, token_options) for t in tokens]
|
||||
return [(_normalize.normalize_token(normalized, t, token_options), token_types.from_id(t[-1])) for t in tokens]
|
||||
|
||||
@@ -52,11 +52,17 @@ class token_types(Enum):
|
||||
WHITESPACE = EnumValue(300)
|
||||
NEWLINE = EnumValue(301)
|
||||
|
||||
WORD_TOKEN_TYPES = set([
|
||||
WORD,
|
||||
ABBREVIATION,
|
||||
IDEOGRAPHIC_CHAR,
|
||||
HANGUL_SYLLABLE,
|
||||
ACRONYM
|
||||
])
|
||||
|
||||
word_token_types = set([
|
||||
token_types.WORD,
|
||||
token_types.ABBREVIATION,
|
||||
token_types.IDEOGRAPHIC_CHAR,
|
||||
token_types.HANGUL_SYLLABLE,
|
||||
token_types.ACRONYM
|
||||
])
|
||||
NUMERIC_TOKEN_TYPES = set([
|
||||
NUMERIC,
|
||||
ORDINAL,
|
||||
ROMAN_NUMERAL,
|
||||
IDEOGRAPHIC_NUMBER,
|
||||
])
|
||||
|
||||
Reference in New Issue
Block a user