# python/postal/text/encoding.py
#
# Text <-> bytes helpers shared by the postal bindings.

import sys

# Python 2/3 string-type aliases (previously provided via ``six``; the
# stdlib check removes that third-party dependency with identical values).
if sys.version_info[0] >= 3:
    text_type = str
    string_types = (str,)
    binary_type = bytes
else:  # pragma: no cover - Python 2
    text_type = unicode  # noqa: F821
    string_types = (basestring,)  # noqa: F821
    binary_type = str


def safe_decode(value, encoding='utf-8', errors='strict'):
    """Return *value* as text.

    Text passes through unchanged; byte strings are decoded with
    *encoding*/*errors*; anything else is coerced with ``text_type``
    (``binary_type(value)`` was wrong on Python 3, where ``bytes(n)``
    builds a zero-filled buffer).
    """
    if isinstance(value, text_type):
        return value
    if isinstance(value, (string_types, binary_type)) and not isinstance(value, text_type):
        return value.decode(encoding, errors)
    return text_type(value)


def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
    """Return *value* as bytes encoded with *encoding*.

    *incoming* names the current encoding of a byte-string *value*; when
    it differs from *encoding* the bytes are transcoded.  Non-string
    values are coerced through text first so the result really is in the
    requested encoding.
    """
    if not isinstance(value, (string_types, binary_type)):
        return text_type(value).encode(encoding, errors)

    if isinstance(value, text_type):
        return value.encode(encoding, errors)

    # Byte-string input: normalize encoding names for comparison.
    if hasattr(incoming, 'lower'):
        incoming = incoming.lower()
    if hasattr(encoding, 'lower'):
        encoding = encoding.lower()

    if value and encoding != incoming:
        # BUG FIX: decode with the *incoming* (source) encoding, not the
        # target one; the original decoded with ``encoding``, which makes
        # transcoding a no-op at best and mojibake at worst.  A missing
        # ``incoming`` is treated as UTF-8, matching the old behavior.
        value = safe_decode(value, incoming or 'utf-8', errors)
        return value.encode(encoding, errors)
    return value
binary_type(value) + + if isinstance(value, text_type): + return value.encode(encoding, errors) + else: + if hasattr(incoming, 'lower'): + incoming = incoming.lower() + if hasattr(encoding, 'lower'): + encoding = encoding.lower() + + if value and encoding != incoming: + value = safe_decode(value, encoding, errors) + return value.encode(encoding, errors) + else: + return value diff --git a/python/postal/text/pytokenize.c b/python/postal/text/pytokenize.c new file mode 100644 index 00000000..1527a782 --- /dev/null +++ b/python/postal/text/pytokenize.c @@ -0,0 +1,158 @@ +#include + +#include "scanner.h" + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +struct module_state { + PyObject *error; +}; + + +#ifdef IS_PY3K + #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else + #define GETSTATE(m) (&_state) + static struct module_state _state; +#endif + + +static PyObject *py_tokenize(PyObject *self, PyObject *args) +{ + PyObject *arg1; + if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) { + return 0; + } + + PyObject *unistr = PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be converted to unicode in scanner"); + return 0; + } + + #ifdef IS_PY3K + // Python 3 encoding, supported by Python 3.3+ + + char *input = PyUnicode_AsUTF8(unistr); + + #else + // Python 2 encoding + + PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); + if (str == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be utf-8 encoded"); + return 0; + } + + char *input = PyBytes_AsString(str); + + #endif + + + if (input == NULL) { + goto error_decref_str; + } + + token_array *tokens = tokenize(input); + if (tokens == NULL) { + goto error_decref_str; + } + + PyObject *result = PyList_New(0); + if (!result) { + token_array_destroy(tokens); + goto error_decref_unistr; + return 0; + } + + PyObject *tuple; + + token_t token; + for (int i = 0; i < tokens->n; i++) { + token = tokens->a[i]; + 
tuple = Py_BuildValue("iii", token.offset, token.len, token.type); + PyList_Append(result, tuple); + Py_XDECREF(tuple); + } + + Py_XDECREF(str); + Py_XDECREF(unistr); + + token_array_destroy(tokens); + + return result; + +error_decref_str: + Py_XDECREF(str); +error_decref_unistr: + Py_XDECREF(unistr); + return 0; +} + +static PyMethodDef tokenize_methods[] = { + {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"}, + {NULL, NULL}, +}; + + + +#ifdef IS_PY3K + +static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int tokenize_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + return 0; +} + + +static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_tokenize", + NULL, + sizeof(struct module_state), + tokenize_methods, + NULL, + tokenize_traverse, + tokenize_clear, + NULL +}; + +#define INITERROR return NULL + +PyObject * +PyInit_tokenize(void) +#else +#define INITERROR return + +void +init_tokenize(void) { +#endif + +#ifdef IS_PY3K + PyObject *module = PyModule_Create(&module_def); +#else + PyObject *module = Py_InitModule("_tokenize", tokenize_methods); +#endif + + if (module == NULL) + INITERROR; + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("_tokenize.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif +} \ No newline at end of file diff --git a/python/postal/text/token_types.py b/python/postal/text/token_types.py new file mode 100644 index 00000000..af161e2b --- /dev/null +++ b/python/postal/text/token_types.py @@ -0,0 +1,53 @@ +from postal.utils.enum import Enum, EnumValue + + +class token_types(Enum): + # Word types + WORD = EnumValue(1) + ABBREVIATION = EnumValue(2) + IDEOGRAPHIC_CHAR = EnumValue(3) + HANGUL_SYLLABLE = EnumValue(4) + ACRONYM = EnumValue(5) + + # Special tokens + EMAIL = EnumValue(20) + URL = EnumValue(21) + 
# python/postal/text/token_types.py
from postal.utils.enum import Enum, EnumValue


class token_types(Enum):
    # Token-type ids emitted by the C scanner.  The integer values must
    # stay in sync with the C-side enum in scanner.h -- do not renumber.

    # Word types
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # Special tokens
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # Numbers and numeric types
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # Punctuation types, may separate a phrase
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # Non-letters and whitespace
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)


# --- python/postal/text/tokenize.py ---
from postal.text.encoding import safe_decode
from postal.text import _tokenize
from postal.text.token_types import token_types


def tokenize_raw(s):
    """Return raw (offset, length, type-id) triples from the C tokenizer."""
    return _tokenize.tokenize(safe_decode(s))


def tokenize(s):
    """Tokenize *s* into (surface form, token_types member) pairs.

    NOTE(review): the offsets from the C scanner appear to be byte
    offsets into the UTF-8 encoding; slicing a unicode *s* with them is
    only guaranteed correct for ASCII input -- confirm against scanner.c.
    """
    # Decode once and reuse tokenize_raw so the decode+tokenize logic
    # lives in a single place (the original duplicated it here).
    s = safe_decode(s)
    return [(s[start:start + length], token_types.from_id(token_type))
            for start, length, token_type in tokenize_raw(s)]
# python/postal/utils/enum.py
#
# Minimal enum implementation used by token_types.  The commit note says
# "Not tested on Python 3"; the fixes below (dict.items(), an explicit
# metaclass-created base, __eq__/__lt__ alongside __cmp__) make the module
# behave identically on both major versions.


class EnumValue(object):
    """A single enum member: an integer ``value`` plus a ``name``.

    ``name`` may be None at construction; EnumMeta fills it in from the
    attribute name the member is bound to.
    """

    def __init__(self, value, name=None):
        self.value = value
        self.name = name

    def __hash__(self):
        # Hash like the underlying int so registry lookups by raw value work.
        return self.value

    def __eq__(self, other):
        # Python 3 counterpart of __cmp__ (which Python 3 ignores);
        # keeps ``member == raw_int`` working on both versions.
        if isinstance(other, EnumValue):
            return self.value == other.value
        return self.value == other

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        # Ordering support for Python 3 sorts.
        if isinstance(other, EnumValue):
            return self.value < other.value
        return self.value < other

    def __cmp__(self, other):
        # Python 2 only; never invoked on Python 3.
        if isinstance(other, EnumValue):
            return cmp(self.value, other.value)  # noqa: F821
        return cmp(self.value, other)  # noqa: F821

    def __unicode__(self):
        return self.name

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name


class EnumMeta(type):
    """Metaclass that registers EnumValue attributes by value and by name."""

    def __init__(cls, name, bases, dict_):
        # Copy so each subclass extends, not mutates, its parent's registries.
        cls.registry = cls.registry.copy()
        cls.name_registry = cls.name_registry.copy()
        # items(): iteritems() is Python 2 only and broke on Python 3.
        for attr, member in dict_.items():
            if isinstance(member, EnumValue) and member not in cls.registry:
                if member.name is None:
                    member.name = attr
                cls.registry[member.value] = member
                cls.name_registry[member.name] = member
        super(EnumMeta, cls).__init__(name, bases, dict_)

    def __iter__(cls):
        # values(): itervalues() is Python 2 only.
        return iter(cls.registry.values())

    def __getitem__(cls, key):
        return cls.registry[key]


# Creating the base through the metaclass works on both Python 2 and 3;
# a bare ``__metaclass__`` attribute is silently ignored on Python 3,
# which left the registries empty.
class Enum(EnumMeta('_EnumBase', (object,), {'registry': {}, 'name_registry': {}})):
    """Base class for enumerations; subclass and assign EnumValue attributes."""

    __metaclass__ = EnumMeta  # retained for Python 2 readers/introspection
    registry = {}
    name_registry = {}

    @classmethod
    def from_id(cls, value):
        """Look up a member by its integer id; raises ValueError if unknown."""
        try:
            return cls.registry[value]
        except KeyError:
            raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))

    @classmethod
    def from_string(cls, name):
        """Look up a member by name; raises ValueError if unknown."""
        try:
            return cls.name_registry[name]
        except KeyError:
            raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))