[python] Adding initial pypostal bindings for tokenize so we can remove address_normalizer dependency. Not tested on Python 3.

This commit is contained in:
Al
2015-09-20 14:59:33 -04:00
parent 3fab0f984f
commit 5485ea2197
8 changed files with 319 additions and 0 deletions

View File

View File

View File

@@ -0,0 +1,34 @@
import six
# Aliases for six's version-agnostic string types, used by the helpers below
# for isinstance checks and conversions.
text_type = six.text_type        # the text (unicode) string type
string_types = six.string_types  # type(s) to test "is this a string" against
binary_type = six.binary_type    # the byte-string type
def safe_decode(value, encoding='utf-8', errors='strict'):
    """Return *value* as a text (unicode) string.

    :param value: text, a byte string, or any other object.
    :param encoding: codec used to decode byte strings.
    :param errors: codec error-handling scheme (``'strict'``, ``'ignore'``...).
    :returns: ``value`` unchanged if it is already text, otherwise its
        decoded / stringified form.
    :raises UnicodeDecodeError: if bytes cannot be decoded under ``errors``.
    """
    if isinstance(value, text_type):
        return value
    if isinstance(value, (string_types, binary_type)):
        return value.decode(encoding, errors)
    if binary_type is str or isinstance(value, (bytearray, memoryview)):
        # Python 2 (where bytes is str), or a Python 3 buffer object:
        # binary_type(value) yields the object's byte representation.
        return binary_type(value).decode(encoding, errors)
    # Python 3: bytes(5) is five NUL bytes and bytes(None) raises TypeError,
    # so arbitrary objects must be stringified with str() instead.
    return text_type(value)
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
    """Return *value* as a byte string encoded with *encoding*.

    :param value: text, a byte string, or any other object.
    :param incoming: encoding a byte-string ``value`` is currently in.
        ``None`` means "assume it is already in ``encoding``"; the bytes are
        then round-tripped through ``encoding``, which validates them.
    :param encoding: target encoding.
    :param errors: codec error-handling scheme.
    :returns: a byte string.
    :raises UnicodeError: if the value cannot be decoded/encoded under
        ``errors``.
    """
    if not isinstance(value, (string_types, binary_type)):
        if binary_type is str or isinstance(value, (bytearray, memoryview)):
            # Python 2 (bytes is str) or a Python 3 buffer object.
            return binary_type(value)
        # Python 3: bytes(obj) does not stringify (bytes(5) is five NULs),
        # so go through the text representation.
        return text_type(value).encode(encoding, errors)

    if isinstance(value, text_type):
        return value.encode(encoding, errors)

    # From here on value is a byte string.
    if hasattr(incoming, 'lower'):
        incoming = incoming.lower()
    if hasattr(encoding, 'lower'):
        encoding = encoding.lower()

    if value and encoding != incoming:
        # Decode with the encoding the bytes are actually in. The previous
        # code decoded with the *target* encoding, so a caller-supplied
        # `incoming` was never honoured.
        return value.decode(incoming or encoding, errors).encode(encoding, errors)
    return value

View File

@@ -0,0 +1,158 @@
#include <Python.h>
#include "scanner.h"
/* IS_PY3K is defined when compiling against Python 3 or newer. */
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
/* Per-module state: holds the module's exception object. */
struct module_state {
    PyObject *error;
};
/*
 * GETSTATE(m) yields a pointer to the module_state for module m.
 * On Python 3 it is the state stored with the module object; on Python 2
 * it is the single file-level _state and m is not used.
 */
#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
/*
 * tokenize(text) -> list of (offset, length, type) tuples.
 *
 * Converts the single argument to unicode, encodes it as UTF-8, runs the
 * scanner's tokenize() over the encoded buffer and returns one 3-tuple of
 * integers per token. Returns NULL with a Python exception set on failure.
 */
static PyObject *py_tokenize(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    PyObject *unistr = NULL;
    PyObject *str = NULL;    /* UTF-8 bytes object; only created on Python 2 */
    PyObject *result = NULL;
    token_array *tokens = NULL;
    char *input;
    size_t i;

    if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) {
        return NULL;
    }

    unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    /* Python 3.3+: the returned buffer is owned by unistr, so it stays valid
       until unistr is released. The cast keeps this compiling on versions
       where the function returns const char *. */
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    /* Python 2: encode into a separate bytes object that we own. */
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto error;
    }
    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto error;
    }

    tokens = tokenize(input);
    if (tokens == NULL) {
        /* Returning NULL without an exception set makes the interpreter
           raise SystemError; report the failure explicitly instead. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError, "Error tokenizing input");
        }
        goto error;
    }

    result = PyList_New(0);
    if (result == NULL) {
        goto error;
    }

    for (i = 0; i < (size_t)tokens->n; i++) {
        token_t token = tokens->a[i];
        PyObject *tuple;
        int status;

        /* Explicit casts make the variadic arguments match the format units
           whatever integer types token_t declares for these fields. */
        tuple = Py_BuildValue("nni",
                              (Py_ssize_t)token.offset,
                              (Py_ssize_t)token.len,
                              (int)token.type);
        if (tuple == NULL) {
            goto error;
        }

        status = PyList_Append(result, tuple);
        Py_DECREF(tuple);  /* the list holds its own reference */
        if (status < 0) {
            goto error;
        }
    }

    token_array_destroy(tokens);
    Py_XDECREF(str);
    Py_DECREF(unistr);
    return result;

error:
    /* Single cleanup path: every pointer is either NULL or owned here. */
    if (tokens != NULL) {
        token_array_destroy(tokens);
    }
    Py_XDECREF(result);
    Py_XDECREF(str);
    Py_XDECREF(unistr);
    return NULL;
}
/* Method table for the _tokenize module; terminated by an all-zero sentinel. */
static PyMethodDef tokenize_methods[] = {
    {
        "tokenize",                 /* name exposed to Python */
        (PyCFunction)py_tokenize,   /* C implementation */
        METH_VARARGS,               /* called with a positional-args tuple */
        "tokenize(text)"            /* docstring */
    },
    {NULL, NULL, 0, NULL}
};
#ifdef IS_PY3K
/* GC traverse hook: visits the exception object kept in the module state. */
static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) {
    struct module_state *state = GETSTATE(m);

    /* Py_VISIT returns from this function early if the visitor reports a
       non-zero result. */
    Py_VISIT(state->error);
    return 0;
}
/* GC clear hook: drops the module state's reference to its exception object. */
static int tokenize_clear(PyObject *m) {
    struct module_state *state = GETSTATE(m);

    Py_CLEAR(state->error);
    return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_tokenize",
NULL,
sizeof(struct module_state),
tokenize_methods,
NULL,
tokenize_traverse,
tokenize_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit_tokenize(void)
#else
#define INITERROR return
void
init_tokenize(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_tokenize", tokenize_methods);
#endif
if (module == NULL)
INITERROR;
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_tokenize.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}

View File

@@ -0,0 +1,53 @@
from postal.utils.enum import Enum, EnumValue
class token_types(Enum):
    """Enumeration of token type ids, grouped by numeric range.

    NOTE(review): these ids presumably mirror the token type constants used
    by the C scanner behind ``_tokenize`` -- confirm before changing any
    value. Some ids inside the ranges are deliberately or incidentally
    unassigned (e.g. 116-118, 123).
    """
    # Word types
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)
    # Special tokens
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)
    # Numbers and numeric types
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)
    # Punctuation types, may separate a phrase
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)
    # Non-letters and whitespace
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

View File

@@ -0,0 +1,12 @@
from postal.text.encoding import safe_decode
from postal.text import _tokenize
from postal.text.token_types import token_types
def tokenize_raw(s):
    """Run the C tokenizer on *s* and return its result unchanged.

    *s* is first converted to text with ``safe_decode``; the return value is
    whatever ``_tokenize.tokenize`` produces for that text.
    """
    text = safe_decode(s)
    return _tokenize.tokenize(text)
def tokenize(s):
    """Split *s* into ``(token, token_type)`` pairs.

    :param s: text or UTF-8 encoded bytes.
    :returns: list of ``(token, token_types member)`` tuples. Tokens are text
        when *s* is text and byte strings when *s* is a byte string.
    :raises ValueError: if the tokenizer reports a type id that
        ``token_types`` does not know.
    """
    text = safe_decode(s)
    # The C extension tokenizes the UTF-8 encoding of the text, so the
    # offsets/lengths it reports are presumably byte positions in that
    # encoding (NOTE(review): confirm against the scanner). Slicing the
    # unicode string with them cuts non-ASCII input in the wrong places, so
    # slice the encoded bytes instead.
    encoded = text.encode('utf-8')
    # safe_decode always returns text, so this tells text input from bytes.
    input_is_text = isinstance(s, type(text))

    tokens = []
    for start, length, token_type in _tokenize.tokenize(text):
        piece = encoded[start:start + length]
        if input_is_text:
            piece = piece.decode('utf-8')
        tokens.append((piece, token_types.from_id(token_type)))
    return tokens

View File

View File

@@ -0,0 +1,62 @@
class EnumValue(object):
    """A single enum member: a raw ``value`` plus an optional ``name``.

    Members hash and compare by ``value``, both against other ``EnumValue``
    instances and against raw values, so ``EnumValue(1) == 1`` and a member
    can be used to look up a dict keyed by raw values.
    """

    def __init__(self, value, name=None):
        self.value = value
        self.name = name

    @staticmethod
    def _raw(other):
        """Return the underlying value of *other* (unwraps EnumValue)."""
        return other.value if isinstance(other, EnumValue) else other

    def __hash__(self):
        # hash() rather than the bare value, so non-integer values work too.
        return hash(self.value)

    # Rich comparisons: Python 3 ignores __cmp__, so without these, equality
    # would fall back to identity and ordering would raise TypeError.
    def __eq__(self, other):
        return self.value == self._raw(other)

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        return self.value < self._raw(other)

    def __le__(self, other):
        return self.value <= self._raw(other)

    def __gt__(self, other):
        return self.value > self._raw(other)

    def __ge__(self, other):
        return self.value >= self._raw(other)

    def __cmp__(self, other):
        # Kept for Python 2 callers that use cmp() directly.
        theirs = self._raw(other)
        return (self.value > theirs) - (self.value < theirs)

    def _label(self):
        """Return the name, or a placeholder when no name has been assigned.

        ``__str__``/``__repr__`` must return a string; an unnamed value would
        otherwise make ``str()`` raise TypeError.
        """
        if self.name is not None:
            return self.name
        return 'EnumValue({!r})'.format(self.value)

    def __unicode__(self):
        return self._label()

    def __str__(self):
        return self._label()

    def __repr__(self):
        return self._label()
class EnumMeta(type):
    """Metaclass that collects ``EnumValue`` class attributes into registries.

    Each class gets its own copy of the ``registry`` (value -> member) and
    ``name_registry`` (name -> member) it inherits, extended with the
    ``EnumValue`` attributes declared in its body. The class must therefore
    define or inherit both attributes.
    """

    def __init__(cls, name, bases, dict_):
        # Copy so that members added here do not leak into parent classes.
        cls.registry = cls.registry.copy()
        cls.name_registry = cls.name_registry.copy()
        # .items() exists on both Python 2 and 3 (iteritems is Python 2 only).
        for attr, member in dict_.items():
            # Test the raw value directly so the check does not depend on how
            # EnumValue implements hashing and equality.
            if isinstance(member, EnumValue) and member.value not in cls.registry:
                if member.name is None:
                    # Unnamed members take the name of the attribute.
                    member.name = attr
                cls.registry[member.value] = member
                cls.name_registry[member.name] = member
        return super(EnumMeta, cls).__init__(name, bases, dict_)

    def __iter__(cls):
        """Iterate over the registered members of the enum class."""
        return iter(cls.registry.values())

    def __getitem__(cls, key):
        """Return the member registered under raw value *key* (KeyError if none)."""
        return cls.registry[key]
# A ``__metaclass__`` attribute is honoured only by Python 2, and the
# ``metaclass=`` keyword is a syntax error there. Deriving from a throwaway
# base that was created by calling EnumMeta directly applies the metaclass on
# both versions, because subclasses inherit their base's metaclass.
class Enum(EnumMeta('_EnumBase', (object,), {'registry': {}, 'name_registry': {}})):
    """Base class for enums whose members are ``EnumValue`` attributes.

    Subclasses declare members as class attributes; ``EnumMeta`` records them
    in ``registry`` (by value) and ``name_registry`` (by name).
    """

    registry = {}
    name_registry = {}

    @classmethod
    def from_id(cls, value):
        """Return the member whose value is *value*.

        :raises ValueError: if no member has that value.
        """
        try:
            return cls.registry[value]
        except KeyError:
            raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))

    @classmethod
    def from_string(cls, name):
        """Return the member called *name*.

        :raises ValueError: if no member has that name.
        """
        try:
            return cls.name_registry[name]
        except KeyError:
            raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))