[python] Make normalized_tokens return token classes as well, mimicking the tokenize API
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from postal.text import _normalize
|
||||
from postal.text import _tokenize
|
||||
from postal.text.token_types import token_types
|
||||
|
||||
from postal.text.encoding import safe_decode
|
||||
|
||||
@@ -40,4 +41,4 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
|
||||
|
||||
# Tuples of (offset, len, type)
|
||||
tokens = _tokenize.tokenize(normalized)
|
||||
return [_normalize.normalize_token(normalized, t, token_options) for t in tokens]
|
||||
return [(_normalize.normalize_token(normalized, t, token_options), token_types.from_id(t[-1])) for t in tokens]
|
||||
|
||||
@@ -52,11 +52,17 @@ class token_types(Enum):
|
||||
WHITESPACE = EnumValue(300)
|
||||
NEWLINE = EnumValue(301)
|
||||
|
||||
WORD_TOKEN_TYPES = set([
|
||||
WORD,
|
||||
ABBREVIATION,
|
||||
IDEOGRAPHIC_CHAR,
|
||||
HANGUL_SYLLABLE,
|
||||
ACRONYM
|
||||
])
|
||||
|
||||
word_token_types = set([
|
||||
token_types.WORD,
|
||||
token_types.ABBREVIATION,
|
||||
token_types.IDEOGRAPHIC_CHAR,
|
||||
token_types.HANGUL_SYLLABLE,
|
||||
token_types.ACRONYM
|
||||
])
|
||||
NUMERIC_TOKEN_TYPES = set([
|
||||
NUMERIC,
|
||||
ORDINAL,
|
||||
ROMAN_NUMERAL,
|
||||
IDEOGRAPHIC_NUMBER,
|
||||
])
|
||||
|
||||
Reference in New Issue
Block a user