[python] Adding initial pypostal bindings for tokenize so we can remove address_normalizer dependency. Not tested on Python 3.
This commit is contained in:
0
python/postal/__init__.py
Normal file
0
python/postal/__init__.py
Normal file
0
python/postal/text/__init__.py
Normal file
0
python/postal/text/__init__.py
Normal file
34
python/postal/text/encoding.py
Normal file
34
python/postal/text/encoding.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import six

# Python 2/3-neutral type aliases from six:
#   text_type    -> unicode on Py2, str on Py3
#   string_types -> (str, unicode) on Py2, (str,) on Py3
#   binary_type  -> str on Py2, bytes on Py3
text_type = six.text_type
string_types = six.string_types
binary_type = six.binary_type
def safe_decode(value, encoding='utf-8', errors='strict'):
    """Coerce *value* to text_type (unicode).

    Args:
        value: text, a byte string, or any other object (stringified
            as a fallback).
        encoding: codec used to decode byte strings.
        errors: decode error policy ('strict', 'ignore', 'replace', ...).

    Returns:
        A text_type string.

    Raises:
        UnicodeDecodeError: if a byte string cannot be decoded with
            *encoding* under the given *errors* policy.
    """
    if isinstance(value, text_type):
        # Already unicode: nothing to do.
        return value

    if isinstance(value, (string_types, binary_type)):
        # Py2 str / Py3 bytes: decode with the requested codec.
        return value.decode(encoding, errors)

    # Fallback for non-string objects. The original binary_type(value) is
    # wrong on Python 3 (bytes(5) builds a 5-byte zero-filled buffer, not
    # b'5'); stringifying via text_type gives the intended result on both
    # Python versions.
    return text_type(value)
||||
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
    """Coerce *value* to binary_type encoded as *encoding*.

    Args:
        value: text, a byte string, or any other object (stringified
            as a fallback).
        incoming: the encoding the input byte string is currently in,
            if known; when given and different from *encoding*, the
            string is transcoded.
        encoding: the desired output encoding.
        errors: encode/decode error policy.

    Returns:
        A binary_type string encoded as *encoding*.
    """
    if not isinstance(value, (string_types, binary_type)):
        # The original binary_type(value) is wrong on Python 3
        # (bytes(5) builds a zero-filled buffer): stringify first,
        # then encode.
        return text_type(value).encode(encoding, errors)

    if isinstance(value, text_type):
        return value.encode(encoding, errors)

    # Normalize encoding names for the comparison below.
    if hasattr(incoming, 'lower'):
        incoming = incoming.lower()
    if hasattr(encoding, 'lower'):
        encoding = encoding.lower()

    if value and encoding != incoming:
        # Transcode: decode using the *source* encoding (the original
        # decoded with the target `encoding`, which made `incoming`
        # dead code), falling back to the target when unspecified so
        # prior behavior for incoming=None is preserved.
        value = safe_decode(value, incoming or encoding, errors)
        return value.encode(encoding, errors)
    else:
        # Empty string, or already in the right encoding.
        return value
158
python/postal/text/pytokenize.c
Normal file
158
python/postal/text/pytokenize.c
Normal file
@@ -0,0 +1,158 @@
|
||||
#include <Python.h>

#include "scanner.h"

/* Detect Python 3 at compile time. */
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

/* Per-module state. Python 3 supports true per-module state via
 * PyModule_GetState; Python 2 falls back to a single static instance. */
struct module_state {
    PyObject *error;   /* the _tokenize.Error exception type */
};


#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
||||
/* tokenize(text) -> list of (offset, length, type) tuples.
 *
 * Accepts any object convertible to unicode, hands its UTF-8 form to the
 * C scanner's tokenize(), and converts the resulting token array into a
 * Python list of 3-tuples.
 *
 * Fixes vs. the original: the shared error labels unconditionally did
 * Py_XDECREF(str), but `str` is only declared in the Python 2 branch, so
 * the file could not compile under Python 3; the PyList_New failure path
 * leaked `str` on Python 2; Py_BuildValue was unchecked; and there was an
 * unreachable `return 0` after a goto.
 */
static PyObject *py_tokenize(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) {
        return 0;
    }

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+.
    // The buffer is owned by unistr and valid while unistr is alive.
    // Cast: PyUnicode_AsUTF8 returns const char * on 3.7+.
    char *input = (char *)PyUnicode_AsUTF8(unistr);
    if (input == NULL) {
        Py_DECREF(unistr);
        return 0;
    }
#else
    // Python 2 encoding
    PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        Py_DECREF(unistr);
        return 0;
    }

    char *input = PyBytes_AsString(str);
    if (input == NULL) {
        Py_DECREF(str);
        Py_DECREF(unistr);
        return 0;
    }
#endif

    token_array *tokens = tokenize(input);
    if (tokens == NULL) {
        goto error;
    }

    PyObject *result = PyList_New(0);
    if (!result) {
        token_array_destroy(tokens);
        goto error;
    }

    token_t token;
    for (int i = 0; i < tokens->n; i++) {
        token = tokens->a[i];
        /* NOTE(review): "iii" assumes offset/len/type fit in int — confirm
         * the token_t field types against scanner.h. */
        PyObject *tuple = Py_BuildValue("iii", token.offset, token.len, token.type);
        if (tuple == NULL) {
            Py_DECREF(result);
            token_array_destroy(tokens);
            goto error;
        }
        PyList_Append(result, tuple);
        Py_DECREF(tuple);
    }

    token_array_destroy(tokens);

#ifndef IS_PY3K
    Py_DECREF(str);
#endif
    Py_DECREF(unistr);

    return result;

error:
    /* `str` only exists on Python 2, so its cleanup must be guarded. */
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);
    return 0;
}
|
||||
/* Method table: exposes tokenize(text) at module level. */
static PyMethodDef tokenize_methods[] = {
    {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"},
    {NULL, NULL},   /* sentinel */
};
|
||||
|
||||
|
||||
#ifdef IS_PY3K

/* GC traversal hook: visit the module-state exception object. */
static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) {
    Py_VISIT(GETSTATE(m)->error);
    return 0;
}

/* GC clear hook: drop the module-state exception object. */
static int tokenize_clear(PyObject *m) {
    Py_CLEAR(GETSTATE(m)->error);
    return 0;
}


/* Python 3 module definition for the "_tokenize" extension module. */
static struct PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenize",                   /* module name */
    NULL,                          /* no module docstring */
    sizeof(struct module_state),   /* per-module state size */
    tokenize_methods,
    NULL,                          /* m_slots */
    tokenize_traverse,
    tokenize_clear,
    NULL                           /* m_free */
};
|
||||
#define INITERROR return NULL
|
||||
|
||||
PyObject *
|
||||
PyInit_tokenize(void)
|
||||
#else
|
||||
#define INITERROR return
|
||||
|
||||
void
|
||||
init_tokenize(void) {
|
||||
#endif
|
||||
|
||||
#ifdef IS_PY3K
|
||||
PyObject *module = PyModule_Create(&module_def);
|
||||
#else
|
||||
PyObject *module = Py_InitModule("_tokenize", tokenize_methods);
|
||||
#endif
|
||||
|
||||
if (module == NULL)
|
||||
INITERROR;
|
||||
struct module_state *st = GETSTATE(module);
|
||||
|
||||
st->error = PyErr_NewException("_tokenize.Error", NULL, NULL);
|
||||
if (st->error == NULL) {
|
||||
Py_DECREF(module);
|
||||
INITERROR;
|
||||
}
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
return module;
|
||||
#endif
|
||||
}
|
||||
53
python/postal/text/token_types.py
Normal file
53
python/postal/text/token_types.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from postal.utils.enum import Enum, EnumValue
|
||||
|
||||
|
||||
class token_types(Enum):
    """Token type ids produced by the C tokenizer.

    NOTE(review): these values appear to mirror the token type enum in the
    C scanner — confirm they stay in sync with scanner.h, since the ids
    cross the C/Python boundary as plain integers.
    """
    # Word types
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # Special tokens
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # Numbers and numeric types
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # Punctuation types, may separate a phrase
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # Non-letters and whitespace
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)
12
python/postal/text/tokenize.py
Normal file
12
python/postal/text/tokenize.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from postal.text.encoding import safe_decode
|
||||
from postal.text import _tokenize
|
||||
from postal.text.token_types import token_types
|
||||
|
||||
|
||||
def tokenize_raw(s):
    """Return the raw (offset, length, token_type) triples for *s*."""
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)
||||
def tokenize(s):
    """Tokenize *s*, returning a list of (token_text, token_type) pairs.

    The input is decoded to unicode once, and that same decoded string is
    what gets sliced with the offsets the tokenizer returns. The original
    sliced the raw input `s`, so for bytes input containing non-ASCII
    characters the slices were taken at the wrong positions.

    NOTE(review): assumes the C tokenizer's offsets/lengths index the
    string it was given (code points, not bytes) — confirm in the scanner.
    """
    decoded = safe_decode(s)
    return [(decoded[start:start + length], token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(decoded)]
||||
0
python/postal/utils/__init__.py
Normal file
0
python/postal/utils/__init__.py
Normal file
62
python/postal/utils/enum.py
Normal file
62
python/postal/utils/enum.py
Normal file
@@ -0,0 +1,62 @@
|
||||
|
||||
class EnumValue(object):
    """A single enum member: an integer id plus a human-readable name.

    Hashes and compares as its integer id, so members interoperate with
    plain ints (e.g. as dict keys in Enum.registry).
    """

    def __init__(self, value, name=None):
        self.value = value
        # May be None here; EnumMeta fills it in from the attribute name.
        self.name = name

    def __hash__(self):
        return self.value

    def _other_value(self, other):
        # Compare against another member's id, or the raw value itself.
        return other.value if isinstance(other, EnumValue) else other

    # Rich comparisons: the original relied solely on __cmp__, which is
    # ignored on Python 3 (and int.__cmp__ no longer exists there), so
    # comparisons would break entirely. These preserve the same ordering
    # semantics on both versions.
    def __eq__(self, other):
        return self.value == self._other_value(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        return self.value < self._other_value(other)

    def __le__(self, other):
        return self.value <= self._other_value(other)

    def __gt__(self, other):
        return self.value > self._other_value(other)

    def __ge__(self, other):
        return self.value >= self._other_value(other)

    def __unicode__(self):
        return self.name

    def __str__(self):
        # NOTE(review): returns None (raising TypeError in str()) if the
        # member was never registered through EnumMeta — confirm intended.
        return self.name

    def __repr__(self):
        return self.name
||||
class EnumMeta(type):
    """Metaclass that collects EnumValue attributes into registries.

    Each new class gets copies of its parent's registries, then registers
    its own EnumValue attributes keyed by id (registry) and by name
    (name_registry), defaulting a member's name to its attribute name.
    """

    def __init__(self, name, bases, dict_):
        self.registry = self.registry.copy()
        self.name_registry = self.name_registry.copy()
        # items(), not the Python 2-only iteritems(), so this works on
        # both major versions.
        for attr, member in dict_.items():
            if isinstance(member, EnumValue) and member not in self.registry:
                if member.name is None:
                    member.name = attr
                self.registry[member.value] = member
                self.name_registry[member.name] = member
        return super(EnumMeta, self).__init__(name, bases, dict_)

    def __iter__(self):
        # iter(values()) replaces the Python 2-only itervalues().
        return iter(self.registry.values())

    def __getitem__(self, key):
        return self.registry[key]
||||
class Enum(EnumMeta('EnumBase', (object,), {'registry': {}, 'name_registry': {}})):
    """Base class for enums whose members are EnumValue attributes.

    Inheriting from a class *created by* EnumMeta gives Enum and every
    subclass the EnumMeta metaclass on both Python 2 and Python 3; the
    original `__metaclass__ = EnumMeta` hook is silently ignored on
    Python 3, so the registries would never have been populated there.
    The empty registry/name_registry dicts live on the synthetic base and
    are copied per-class by EnumMeta.__init__, exactly as before.
    """

    @classmethod
    def from_id(cls, value):
        """Look up a member by its integer id; raise ValueError if unknown."""
        try:
            return cls.registry[value]
        except KeyError:
            raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))

    @classmethod
    def from_string(cls, name):
        """Look up a member by its name; raise ValueError if unknown."""
        try:
            return cls.name_registry[name]
        except KeyError:
            raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))
Reference in New Issue
Block a user