[rm] Removing Python bindings from this project, moving to https://github.com/openvenues/pypostal
This commit is contained in:
@@ -1,16 +0,0 @@
|
||||
import _expand
|
||||
from postal.text.encoding import safe_decode
|
||||
|
||||
DEFAULT_LANGUAGES = ('en',)


def expand_address(address, languages=DEFAULT_LANGUAGES, **kw):
    '''
    Decode the address to text and hand it to the _expand extension.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param languages: a tuple or list of ISO language code strings
                      (e.g. "en", "fr", "de", etc.) to use in expansion.
                      Default is English. Until automatic language
                      classification is ready in libpostal, this parameter
                      is required.
    @param kw: additional keyword options, forwarded unchanged to
               _expand.expand_address
    @return: the value returned by _expand.expand_address
    '''
    decoded = safe_decode(address, 'utf-8')
    return _expand.expand_address(decoded, languages=languages, **kw)
|
||||
@@ -1,14 +0,0 @@
|
||||
import _parser
|
||||
from postal.text.encoding import safe_decode
|
||||
|
||||
DEFAULT_LANGUAGES = ('en',)


def parse_address(address, language=None, country=None):
    '''
    Decode the address to text and hand it to the _parser extension.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    @return: the value returned by _parser.parse_address
    '''
    decoded = safe_decode(address, 'utf-8')
    return _parser.parse_address(decoded, language=language, country=country)
|
||||
@@ -1,346 +0,0 @@
|
||||
#include <Python.h>
|
||||
#include <libpostal/libpostal.h>
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
#define IS_PY3K
|
||||
#endif
|
||||
|
||||
struct module_state {
|
||||
PyObject *error;
|
||||
};
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
|
||||
#else
|
||||
#define GETSTATE(m) (&_state)
|
||||
static struct module_state _state;
|
||||
#endif
|
||||
|
||||
|
||||
/* Free an array of strdup()'d language codes (safe to call with NULL). */
static void py_expand_free_languages(char **languages, size_t num_languages) {
    if (languages == NULL) {
        return;
    }
    for (size_t i = 0; i < num_languages; i++) {
        free(languages[i]);
    }
    free(languages);
}

/*
 * expand_address(address, languages=..., **options)
 *
 * Converts the address to UTF-8, copies the language codes out of the
 * `languages` sequence, calls libpostal's expand_address() and returns the
 * expansions as a list of unicode strings.
 *
 * Returns NULL with a Python exception set on failure.
 */
static PyObject *py_expand(PyObject *self, PyObject *args, PyObject *keywords) {
    PyObject *arg_input;
    /* Optional: PyArg_ParseTupleAndKeywords leaves omitted optional
       arguments untouched, so this must have a defined value. */
    PyObject *arg_languages = Py_None;
    normalize_options_t options = LIBPOSTAL_DEFAULT_OPTIONS;

    PyObject *result = NULL;
    PyObject *unistr_input = NULL;
#ifndef IS_PY3K
    PyObject *str_input = NULL;
#endif
    char *input = NULL;

    char **languages = NULL;
    size_t num_languages = 0;
    int languages_failed = 0;

    char **expansions = NULL;
    size_t num_expansions = 0;

    static char *kwlist[] = {"address",
                             "languages",
                             "address_components",
                             "latin_ascii",
                             "transliterate",
                             "strip_accents",
                             "decompose",
                             "lowercase",
                             "trim_string",
                             "replace_word_hyphens",
                             "delete_word_hyphens",
                             "replace_numeric_hyphens",
                             "delete_numeric_hyphens",
                             "split_alpha_from_numeric",
                             "delete_final_periods",
                             "delete_acronym_periods",
                             "drop_english_possessives",
                             "delete_apostrophes",
                             "expand_numex",
                             "roman_numerals",
                             NULL
                            };

    /* "H" stores an unsigned short and "I" an unsigned int, so the
       destinations use exactly those types. */
    unsigned short address_components = (unsigned short)options.address_components;
    unsigned int latin_ascii = options.latin_ascii;
    unsigned int transliterate = options.transliterate;
    unsigned int strip_accents = options.strip_accents;
    unsigned int decompose = options.decompose;
    unsigned int lowercase = options.lowercase;
    unsigned int trim_string = options.trim_string;
    unsigned int replace_word_hyphens = options.replace_word_hyphens;
    unsigned int delete_word_hyphens = options.delete_word_hyphens;
    unsigned int replace_numeric_hyphens = options.replace_numeric_hyphens;
    unsigned int delete_numeric_hyphens = options.delete_numeric_hyphens;
    unsigned int split_alpha_from_numeric = options.split_alpha_from_numeric;
    unsigned int delete_final_periods = options.delete_final_periods;
    unsigned int delete_acronym_periods = options.delete_acronym_periods;
    unsigned int drop_english_possessives = options.drop_english_possessives;
    unsigned int delete_apostrophes = options.delete_apostrophes;
    unsigned int expand_numex = options.expand_numex;
    unsigned int roman_numerals = options.roman_numerals;

    /* One format unit and one output pointer per kwlist entry, in kwlist
       order: O, O, H, then 17 x I (written in groups of five). */
    if (!PyArg_ParseTupleAndKeywords(args, keywords,
                                     "O|OH" "IIIII" "IIIII" "IIIII" "II" ":pyexpand", kwlist,
                                     &arg_input, &arg_languages,
                                     &address_components,
                                     &latin_ascii,
                                     &transliterate,
                                     &strip_accents,
                                     &decompose,
                                     &lowercase,
                                     &trim_string,
                                     &replace_word_hyphens,
                                     &delete_word_hyphens,
                                     &replace_numeric_hyphens,
                                     &delete_numeric_hyphens,
                                     &split_alpha_from_numeric,
                                     &delete_final_periods,
                                     &delete_acronym_periods,
                                     &drop_english_possessives,
                                     &delete_apostrophes,
                                     &expand_numex,
                                     &roman_numerals
                                     )) {
        return NULL;
    }

    options.address_components = address_components;
    options.latin_ascii = latin_ascii;
    options.transliterate = transliterate;
    options.strip_accents = strip_accents;
    options.decompose = decompose;
    options.lowercase = lowercase;
    options.trim_string = trim_string;
    options.replace_word_hyphens = replace_word_hyphens;
    options.delete_word_hyphens = delete_word_hyphens;
    options.replace_numeric_hyphens = replace_numeric_hyphens;
    options.delete_numeric_hyphens = delete_numeric_hyphens;
    options.split_alpha_from_numeric = split_alpha_from_numeric;
    options.delete_final_periods = delete_final_periods;
    options.delete_acronym_periods = delete_acronym_periods;
    options.drop_english_possessives = drop_english_possessives;
    options.delete_apostrophes = delete_apostrophes;
    options.expand_numex = expand_numex;
    options.roman_numerals = roman_numerals;

    unistr_input = PyUnicode_FromObject(arg_input);
    if (unistr_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be converted to unicode");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    // The returned buffer is owned by unistr_input.
    input = (char *)PyUnicode_AsUTF8(unistr_input);
#else
    // Python 2 encoding
    str_input = PyUnicode_AsEncodedString(unistr_input, "utf-8", "strict");
    if (str_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be utf-8 encoded");
        goto exit_decref_unistr;
    }

    input = PyBytes_AsString(str_input);
#endif

    if (input == NULL) {
        goto exit_decref_str;
    }

    if (PySequence_Check(arg_languages)) {
        PyObject *seq = PySequence_Fast(arg_languages, "Expected a sequence");
        if (seq == NULL) {
            goto exit_decref_str;
        }
        Py_ssize_t len_languages = PySequence_Fast_GET_SIZE(seq);

        if (len_languages > 0) {
            languages = malloc((size_t)len_languages * sizeof(char *));
            if (languages == NULL) {
                PyErr_NoMemory();
                languages_failed = 1;
            }
        }

        for (Py_ssize_t i = 0; languages != NULL && i < len_languages; i++) {
            PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
            const char *language = NULL;

#ifdef IS_PY3K
            if (PyBytes_Check(item)) {
                language = PyBytes_AsString(item);
            } else if (PyUnicode_Check(item)) {
                /* Plain str codes are the natural spelling on Python 3. */
                language = PyUnicode_AsUTF8(item);
                if (language == NULL) {
                    languages_failed = 1;
                    break;
                }
            }
#else
            if (PyString_Check(item)) {
                language = PyString_AsString(item);
            }
#endif

            /* Entries that are not strings are skipped, as before. */
            if (language == NULL) {
                continue;
            }

            if (strlen(language) >= MAX_LANGUAGE_LEN) {
                PyErr_SetString(PyExc_TypeError, "language was longer than a language code");
                languages_failed = 1;
                break;
            }

            char *language_copy = strdup(language);
            if (language_copy == NULL) {
                PyErr_NoMemory();
                languages_failed = 1;
                break;
            }
            languages[num_languages] = language_copy;
            num_languages++;
        }

        Py_DECREF(seq);

        if (languages_failed) {
            goto exit_free_languages;
        }

        if (num_languages > 0) {
            options.languages = languages;
            options.num_languages = (int)num_languages;
        } else {
            free(languages);
            languages = NULL;
        }
    }

    if (languages == NULL) {
        PyErr_SetString(PyExc_TypeError, "Must specify languages=[list of language codes] to expand_address");
        goto exit_decref_str;
    }

    expansions = expand_address(input, options, &num_expansions);
    if (expansions == NULL) {
        /* Returning NULL with no exception set makes CPython raise
           SystemError, so report the failure explicitly. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError, "libpostal could not expand the address");
        }
        goto exit_free_languages;
    }

    result = PyList_New((Py_ssize_t)num_expansions);
    if (!result) {
        goto exit_free_expansions;
    }

    for (size_t i = 0; i < num_expansions; i++) {
        char *expansion = expansions[i];
        PyObject *u = PyUnicode_DecodeUTF8((const char *)expansion, strlen(expansion), "strict");
        if (u == NULL) {
            /* Py_CLEAR also resets the pointer, so the released list is
               never returned to the caller. */
            Py_CLEAR(result);
            goto exit_free_expansions;
        }
        // Note: PyList_SetItem steals a reference, so don't worry about DECREF
        PyList_SetItem(result, (Py_ssize_t)i, u);
    }

exit_free_expansions:
    for (size_t i = 0; i < num_expansions; i++) {
        free(expansions[i]);
    }
    free(expansions);
exit_free_languages:
    py_expand_free_languages(languages, num_languages);
exit_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str_input);
#endif
exit_decref_unistr:
    Py_XDECREF(unistr_input);

    return result;
}
|
||||
|
||||
static PyMethodDef expand_methods[] = {
|
||||
{"expand_address", (PyCFunction)py_expand, METH_VARARGS | METH_KEYWORDS, "expand_address(text, **kw)"},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
|
||||
static int expand_traverse(PyObject *m, visitproc visit, void *arg) {
|
||||
Py_VISIT(GETSTATE(m)->error);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int expand_clear(PyObject *m) {
|
||||
Py_CLEAR(GETSTATE(m)->error);
|
||||
libpostal_teardown();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct PyModuleDef module_def = {
|
||||
PyModuleDef_HEAD_INIT,
|
||||
"_expand",
|
||||
NULL,
|
||||
sizeof(struct module_state),
|
||||
expand_methods,
|
||||
NULL,
|
||||
expand_traverse,
|
||||
expand_clear,
|
||||
NULL
|
||||
};
|
||||
|
||||
#define INITERROR return NULL
|
||||
|
||||
PyObject *
|
||||
PyInit_expand(void) {
|
||||
|
||||
#else
|
||||
|
||||
#define INITERROR return
|
||||
|
||||
void cleanup_libpostal(void) {
|
||||
libpostal_teardown();
|
||||
}
|
||||
|
||||
void
|
||||
init_expand(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef IS_PY3K
|
||||
PyObject *module = PyModule_Create(&module_def);
|
||||
#else
|
||||
PyObject *module = Py_InitModule("_expand", expand_methods);
|
||||
#endif
|
||||
|
||||
if (module == NULL) {
|
||||
INITERROR;
|
||||
}
|
||||
struct module_state *st = GETSTATE(module);
|
||||
|
||||
st->error = PyErr_NewException("_expand.Error", NULL, NULL);
|
||||
if (st->error == NULL) {
|
||||
Py_DECREF(module);
|
||||
INITERROR;
|
||||
}
|
||||
|
||||
if (!libpostal_setup()) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
"Error loading libpostal");
|
||||
}
|
||||
|
||||
PyModule_AddIntConstant(module, "ADDRESS_ANY", ADDRESS_ANY);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_NAME", ADDRESS_NAME);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_HOUSE_NUMBER", ADDRESS_HOUSE_NUMBER);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_STREET", ADDRESS_STREET);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_UNIT", ADDRESS_UNIT);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_LOCALITY", ADDRESS_LOCALITY);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_ADMIN1", ADDRESS_ADMIN1);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_ADMIN2", ADDRESS_ADMIN2);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_ADMIN3", ADDRESS_ADMIN3);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_ADMIN4", ADDRESS_ADMIN4);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_ADMIN_OTHER", ADDRESS_ADMIN_OTHER);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_COUNTRY", ADDRESS_COUNTRY);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_NEIGHBORHOOD", ADDRESS_NEIGHBORHOOD);
|
||||
PyModule_AddIntConstant(module, "ADDRESS_ALL", ADDRESS_ALL);
|
||||
|
||||
#ifndef IS_PY3K
|
||||
Py_AtExit(&cleanup_libpostal);
|
||||
#endif
|
||||
|
||||
#if IS_PY3K
|
||||
return module;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1,299 +0,0 @@
|
||||
#include <Python.h>
|
||||
#include <libpostal/libpostal.h>
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
#define IS_PY3K
|
||||
#endif
|
||||
|
||||
struct module_state {
|
||||
PyObject *error;
|
||||
};
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
|
||||
#else
|
||||
#define GETSTATE(m) (&_state)
|
||||
static struct module_state _state;
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * parse_address(address, language=None, country=None)
 *
 * Converts the address (and the optional language / country codes) to
 * UTF-8, calls libpostal's parse_address() and returns a list of
 * (component, label) tuples of unicode strings.
 *
 * Returns NULL with a Python exception set on failure.
 */
static PyObject *py_parse_address(PyObject *self, PyObject *args, PyObject *keywords) {
    PyObject *arg_input;
    PyObject *arg_language = Py_None;
    PyObject *arg_country = Py_None;

    PyObject *result = NULL;

    /* Every owned reference starts as NULL so the single cleanup section
       can release whatever was acquired with Py_XDECREF. */
    PyObject *unistr_input = NULL;
    PyObject *unistr_language = NULL;
    PyObject *unistr_country = NULL;
#ifndef IS_PY3K
    PyObject *str_input = NULL;
    PyObject *str_language = NULL;
    PyObject *str_country = NULL;
#endif

    char *input = NULL;
    char *language = NULL;
    char *country = NULL;

    address_parser_options_t options = LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
    address_parser_response_t *parsed = NULL;

    static char *kwlist[] = {"address",
                             "language",
                             "country",
                             NULL
                            };

    if (!PyArg_ParseTupleAndKeywords(args, keywords,
                                     "O|OO:pyparser", kwlist,
                                     &arg_input, &arg_language,
                                     &arg_country
                                     )) {
        return NULL;
    }

    unistr_input = PyUnicode_FromObject(arg_input);
    if (unistr_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be converted to unicode");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr_input);
#else
    // Python 2 encoding
    str_input = PyUnicode_AsEncodedString(unistr_input, "utf-8", "strict");
    if (str_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be utf-8 encoded");
        goto cleanup;
    }

    input = PyBytes_AsString(str_input);
#endif

    if (input == NULL) {
        goto cleanup;
    }

    if (arg_language != Py_None) {
        unistr_language = PyUnicode_FromObject(arg_language);
        if (unistr_language == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "Language could not be converted to unicode");
            /* Stop here: the conversion below must not see NULL. */
            goto cleanup;
        }

#ifdef IS_PY3K
        language = (char *)PyUnicode_AsUTF8(unistr_language);
#else
        /* Assigns the function-level variable (no shadowing declaration)
           so the encoded string is released during cleanup. */
        str_language = PyUnicode_AsEncodedString(unistr_language, "utf-8", "strict");
        if (str_language == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "Language could not be utf-8 encoded");
            goto cleanup;
        }

        language = PyBytes_AsString(str_language);
#endif

        if (language == NULL) {
            goto cleanup;
        }
    }

    if (arg_country != Py_None) {
        unistr_country = PyUnicode_FromObject(arg_country);
        if (unistr_country == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "Country could not be converted to unicode");
            goto cleanup;
        }

#ifdef IS_PY3K
        country = (char *)PyUnicode_AsUTF8(unistr_country);
#else
        str_country = PyUnicode_AsEncodedString(unistr_country, "utf-8", "strict");
        if (str_country == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "Country could not be utf-8 encoded");
            goto cleanup;
        }

        country = PyBytes_AsString(str_country);
#endif

        if (country == NULL) {
            goto cleanup;
        }
    }

    options.language = language;
    options.country = country;

    parsed = parse_address(input, options);
    if (parsed == NULL) {
        /* Returning NULL with no exception set makes CPython raise
           SystemError, so report the failure explicitly. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError, "libpostal could not parse the address");
        }
        goto cleanup;
    }

    result = PyList_New((Py_ssize_t)parsed->num_components);
    if (!result) {
        goto cleanup;
    }

    for (size_t i = 0; i < parsed->num_components; i++) {
        char *component = parsed->components[i];
        char *label = parsed->labels[i];

        PyObject *component_unicode = PyUnicode_DecodeUTF8((const char *)component, strlen(component), "strict");
        if (component_unicode == NULL) {
            /* Py_CLEAR resets the pointer so the released list is not
               returned. */
            Py_CLEAR(result);
            goto cleanup;
        }

        PyObject *label_unicode = PyUnicode_DecodeUTF8((const char *)label, strlen(label), "strict");
        if (label_unicode == NULL) {
            Py_DECREF(component_unicode);
            Py_CLEAR(result);
            goto cleanup;
        }

        /* "(OO)" takes its own references, so ours are dropped whether or
           not the tuple was built. */
        PyObject *tuple = Py_BuildValue("(OO)", component_unicode, label_unicode);
        Py_DECREF(component_unicode);
        Py_DECREF(label_unicode);
        if (tuple == NULL) {
            Py_CLEAR(result);
            goto cleanup;
        }

        // Note: PyList_SetItem steals a reference, so don't worry about DECREF
        PyList_SetItem(result, (Py_ssize_t)i, tuple);
    }

cleanup:
    if (parsed != NULL) {
        address_parser_response_destroy(parsed);
    }
#ifndef IS_PY3K
    Py_XDECREF(str_country);
    Py_XDECREF(str_language);
    Py_XDECREF(str_input);
#endif
    Py_XDECREF(unistr_country);
    Py_XDECREF(unistr_language);
    Py_XDECREF(unistr_input);

    return result;
}
|
||||
|
||||
static PyMethodDef parser_methods[] = {
|
||||
{"parse_address", (PyCFunction)py_parse_address, METH_VARARGS | METH_KEYWORDS, "parse_address(text, language, country)"},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
|
||||
static int parser_traverse(PyObject *m, visitproc visit, void *arg) {
|
||||
Py_VISIT(GETSTATE(m)->error);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parser_clear(PyObject *m) {
|
||||
Py_CLEAR(GETSTATE(m)->error);
|
||||
libpostal_teardown();
|
||||
libpostal_teardown_parser();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct PyModuleDef module_def = {
|
||||
PyModuleDef_HEAD_INIT,
|
||||
"_parser",
|
||||
NULL,
|
||||
sizeof(struct module_state),
|
||||
parser_methods,
|
||||
NULL,
|
||||
parser_traverse,
|
||||
parser_clear,
|
||||
NULL
|
||||
};
|
||||
|
||||
#define INITERROR return NULL
|
||||
|
||||
PyObject *
|
||||
PyInit_parser(void) {
|
||||
#else
|
||||
|
||||
#define INITERROR return
|
||||
|
||||
void cleanup_libpostal(void) {
|
||||
libpostal_teardown();
|
||||
libpostal_teardown_parser();
|
||||
}
|
||||
|
||||
void
|
||||
init_parser(void) {
|
||||
#endif
|
||||
|
||||
#ifdef IS_PY3K
|
||||
PyObject *module = PyModule_Create(&module_def);
|
||||
#else
|
||||
PyObject *module = Py_InitModule("_parser", parser_methods);
|
||||
#endif
|
||||
|
||||
if (module == NULL) {
|
||||
INITERROR;
|
||||
}
|
||||
struct module_state *st = GETSTATE(module);
|
||||
|
||||
st->error = PyErr_NewException("_parser.Error", NULL, NULL);
|
||||
if (st->error == NULL) {
|
||||
Py_DECREF(module);
|
||||
INITERROR;
|
||||
}
|
||||
|
||||
if (!libpostal_setup() || !libpostal_setup_parser()) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
"Error loading libpostal data");
|
||||
}
|
||||
|
||||
#ifndef IS_PY3K
|
||||
Py_AtExit(&cleanup_libpostal);
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
return module;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
import six
|
||||
|
||||
text_type = six.text_type
|
||||
string_types = six.string_types
|
||||
binary_type = six.binary_type
|
||||
|
||||
|
||||
def safe_decode(value, encoding='utf-8', errors='strict'):
    '''
    Return value as a text (unicode) string.

    @param value: text, a byte string, or any other object
    @param encoding: codec used to decode byte strings
    @param errors: codec error handling scheme
    @return: value itself if it is already text, the decoded text for byte
             strings, otherwise the textual form of the object
    '''
    if isinstance(value, text_type):
        return value

    if isinstance(value, (string_types, binary_type)):
        return value.decode(encoding, errors)

    if binary_type is str:
        # Python 2: str(obj) is the byte-string form of the object.
        return binary_type(value).decode(encoding, errors)

    # Python 3: bytes(obj) is not a string conversion (bytes(5) is five
    # NUL bytes, bytes(None) raises), so use the text form directly.
    return text_type(value)
|
||||
|
||||
|
||||
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
    '''
    Return value as a byte string in the requested encoding.

    @param value: text, a byte string, or any other object
    @param incoming: encoding of value when it is a byte string; when not
                     given, byte strings are assumed to already be in
                     `encoding`
    @param encoding: target encoding
    @param errors: codec error handling scheme
    @return: the encoded byte string
    '''
    if not isinstance(value, (string_types, binary_type)):
        if binary_type is str:
            # Python 2: str(obj) is already a byte string.
            return binary_type(value)
        # Python 3: bytes(obj) is not a string conversion, so go through
        # the text form of the object instead.
        return text_type(value).encode(encoding, errors)

    if isinstance(value, text_type):
        return value.encode(encoding, errors)

    if hasattr(incoming, 'lower'):
        incoming = incoming.lower()
    if hasattr(encoding, 'lower'):
        encoding = encoding.lower()

    if value and encoding != incoming:
        # Decode with the source encoding when the caller supplied one;
        # decoding with the target encoding would misread the bytes.
        value = safe_decode(value, incoming or encoding, errors)
        return value.encode(encoding, errors)

    return value
|
||||
@@ -1,84 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from postal.text import _normalize
|
||||
from postal.text.tokenize import tokenize_raw
|
||||
from postal.text.token_types import token_types
|
||||
|
||||
from postal.text.encoding import safe_decode
|
||||
|
||||
# String options (bit flags re-exported from the _normalize extension)
NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII
NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE
NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS
NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE
NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE
NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM
NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS

# String flags applied when the caller does not pass string_options
DEFAULT_STRING_OPTIONS = (
    NORMALIZE_STRING_LATIN_ASCII
    | NORMALIZE_STRING_DECOMPOSE
    | NORMALIZE_STRING_TRIM
    | NORMALIZE_STRING_REPLACE_HYPHENS
    | NORMALIZE_STRING_STRIP_ACCENTS
    | NORMALIZE_STRING_LOWERCASE
)

# Token options (bit flags re-exported from the _normalize extension)
NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS
NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS

# Token flags applied when the caller does not pass token_options
DEFAULT_TOKEN_OPTIONS = (
    NORMALIZE_TOKEN_REPLACE_HYPHENS
    | NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
    | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
    | NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
    | NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
    | NORMALIZE_TOKEN_REPLACE_DIGITS
)
|
||||
|
||||
|
||||
def remove_parens(tokens):
    '''
    Drop opening/closing punctuation tokens and everything between them.

    @param tokens: iterable of (token, token_type) pairs
    @return: list of the pairs that lie outside any open/close span
    '''
    depth = 0
    kept = []
    for token, token_class in tokens:
        if token_class == token_types.PUNCT_OPEN:
            depth += 1
            continue
        if token_class == token_types.PUNCT_CLOSE:
            # An unmatched closer is dropped without changing the depth.
            if depth > 0:
                depth -= 1
            continue
        if depth <= 0:
            kept.append((token, token_class))
    return kept
|
||||
|
||||
|
||||
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Returns a list of (normalized token, token type) pairs; with
    strip_parentheticals set, the list is passed through remove_parens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)

    # Pick the string normalizer based on the Latin-ASCII flag.
    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        normalize_string = _normalize.normalize_string_latin
    else:
        normalize_string = _normalize.normalize_string_utf8
    normalized = normalize_string(s, string_options)

    # Raw tokens are tuples of (offset, len, type)
    tokens = []
    for raw_token in tokenize_raw(normalized):
        token_text = _normalize.normalize_token(normalized, raw_token, token_options)
        tokens.append((token_text, token_types.from_id(raw_token[-1])))

    return remove_parens(tokens) if strip_parentheticals else tokens
|
||||
@@ -1,328 +0,0 @@
|
||||
#include <Python.h>
|
||||
|
||||
#include "src/normalize.h"
|
||||
#include "src/transliterate.h"
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
#define IS_PY3K
|
||||
#endif
|
||||
|
||||
struct module_state {
|
||||
PyObject *error;
|
||||
};
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
|
||||
#else
|
||||
#define GETSTATE(m) (&_state)
|
||||
static struct module_state _state;
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
 * normalize_string_utf8(input, options)
 *
 * Converts `input` to UTF-8, passes it with the option bits to
 * normalize_string_utf8() and returns the result as a unicode object.
 * Returns NULL with a Python exception set on failure.
 */
static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    /* The "K" format unit stores an unsigned long long. */
    unsigned long long options;

    PyObject *result = NULL;
    PyObject *unistr = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif
    char *input = NULL;
    char *normalized = NULL;

    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) {
        return NULL;
    }

    unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto exit_decref_unistr;
    }

    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto exit_decref_str;
    }

    normalized = normalize_string_utf8(input, (uint64_t)options);
    if (normalized == NULL) {
        /* Returning NULL with no exception set makes CPython raise
           SystemError, so report the failure explicitly. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_ValueError,
                            "String could not be normalized");
        }
        goto exit_decref_str;
    }

    result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Result could not be utf-8 decoded");
    }

exit_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_decref_unistr:
    Py_XDECREF(unistr);
    return result;
}
|
||||
|
||||
|
||||
/*
 * normalize_string_latin(input, options)
 *
 * Converts `input` to UTF-8, passes it with its byte length and the option
 * bits to normalize_string_latin() and returns the result as a unicode
 * object. Returns NULL with a Python exception set on failure.
 */
static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    /* The "K" format unit stores an unsigned long long. */
    unsigned long long options;

    PyObject *result = NULL;
    PyObject *unistr = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif
    char *input = NULL;
    char *normalized = NULL;

    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) {
        return NULL;
    }

    unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto exit_decref_unistr;
    }

    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto exit_decref_str;
    }

    normalized = normalize_string_latin(input, strlen(input), (uint64_t)options);
    if (normalized == NULL) {
        /* Must be checked before strlen() below; also avoids returning
           NULL with no exception set. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_ValueError,
                            "String could not be normalized");
        }
        goto exit_decref_str;
    }

    result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Result could not be utf-8 decoded");
    }

exit_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_decref_unistr:
    Py_XDECREF(unistr);
    return result;
}
|
||||
|
||||
|
||||
|
||||
/*
 * normalize_token(input, (offset, len, type), options)
 *
 * Converts `input` to UTF-8, normalizes the token described by the
 * (offset, len, type) triple with add_normalized_token() and returns it as
 * a unicode object. Returns NULL with a Python exception set on failure.
 */
static PyObject *py_normalize_token(PyObject *self, PyObject *args)
{
    PyObject *s;

    /* "I" stores an unsigned int, "H" an unsigned short and "K" an
       unsigned long long. */
    unsigned int offset;
    unsigned int len;
    unsigned short type;
    unsigned long long options;

    PyObject *result = NULL;
    PyObject *unistr = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif
    char *input = NULL;
    char_array *token_buffer = NULL;

    if (!PyArg_ParseTuple(args, "O(IIH)K:normalize", &s, &offset, &len, &type, &options)) {
        PyErr_SetString(PyExc_TypeError,
                        "Error parsing arguments");
        return NULL;
    }

    token_t token = (token_t){(size_t)offset, (size_t)len, type};

    unistr = PyUnicode_FromObject(s);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Parameter could not be utf-8 encoded");
        goto exit_decref_unistr;
    }

    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto exit_decref_str;
    }

    /* The offset and length come straight from the caller; reject a token
       that does not lie inside the encoded input.
       NOTE(review): assumes offset/len are byte positions in the UTF-8
       string -- confirm against tokenize_raw. */
    size_t input_len = strlen(input);
    if (token.offset > input_len || token.len > input_len - token.offset) {
        PyErr_SetString(PyExc_ValueError,
                        "Token offset and length are out of range");
        goto exit_decref_str;
    }

    token_buffer = char_array_new_size(token.len);
    if (token_buffer == NULL) {
        PyErr_NoMemory();
        goto exit_decref_str;
    }

    add_normalized_token(token_buffer, input, token, (uint64_t)options);
    char *token_str = char_array_get_string(token_buffer);
    result = PyUnicode_DecodeUTF8((const char *)token_str, token_buffer->n - 1, "strict");
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Error decoding token");
    }

    char_array_destroy(token_buffer);

exit_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_decref_unistr:
    Py_XDECREF(unistr);
    return result;
}
|
||||
|
||||
static PyMethodDef normalize_methods[] = {
|
||||
{"normalize_string_utf8", (PyCFunction)py_normalize_string_utf8, METH_VARARGS, "normalize_string_utf8(input, options)"},
|
||||
{"normalize_string_latin", (PyCFunction)py_normalize_string_latin, METH_VARARGS, "normalize_string_latin(input, options)"},
|
||||
{"normalize_token", (PyCFunction)py_normalize_token, METH_VARARGS, "normalize_token(input, options)"},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
|
||||
/* GC traversal hook: visits the exception object kept in module state. */
static int normalize_traverse(PyObject *m, visitproc visit, void *arg) {
    struct module_state *state = GETSTATE(m);

    /* Py_VISIT relies on the parameters being named `visit` and `arg`. */
    Py_VISIT(state->error);
    return 0;
}
|
||||
|
||||
/* GC clear hook: drops the exception object kept in module state. */
static int normalize_clear(PyObject *m) {
    struct module_state *state = GETSTATE(m);

    Py_CLEAR(state->error);
    return 0;
}
|
||||
|
||||
|
||||
static struct PyModuleDef module_def = {
|
||||
PyModuleDef_HEAD_INIT,
|
||||
"_normalize",
|
||||
NULL,
|
||||
sizeof(struct module_state),
|
||||
normalize_methods,
|
||||
NULL,
|
||||
normalize_traverse,
|
||||
normalize_clear,
|
||||
NULL
|
||||
};
|
||||
|
||||
#define INITERROR return NULL
|
||||
|
||||
PyObject *
|
||||
PyInit_normalize(void) {
|
||||
#else
|
||||
#define INITERROR return
|
||||
|
||||
void
|
||||
init_normalize(void) {
|
||||
#endif
|
||||
|
||||
#ifdef IS_PY3K
|
||||
PyObject *module = PyModule_Create(&module_def);
|
||||
#else
|
||||
PyObject *module = Py_InitModule("_normalize", normalize_methods);
|
||||
#endif
|
||||
|
||||
if (module == NULL)
|
||||
INITERROR;
|
||||
struct module_state *st = GETSTATE(module);
|
||||
|
||||
st->error = PyErr_NewException("_normalize.Error", NULL, NULL);
|
||||
if (st->error == NULL) {
|
||||
Py_DECREF(module);
|
||||
INITERROR;
|
||||
}
|
||||
|
||||
if (!transliteration_module_setup(NULL)) {
|
||||
PyErr_SetString(PyExc_RuntimeError,
|
||||
"Could not load transliterate module");
|
||||
Py_DECREF(module);
|
||||
INITERROR;
|
||||
}
|
||||
|
||||
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LATIN_ASCII));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRANSLITERATE));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_STRIP_ACCENTS));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM));
|
||||
PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS));
|
||||
|
||||
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS));
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_HYPHENS));
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_FINAL_PERIOD));
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS));
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES));
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE));
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC));
|
||||
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_DIGITS));
|
||||
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
return module;
|
||||
#endif
|
||||
}
|
||||
@@ -1,164 +0,0 @@
|
||||
#include <Python.h>
|
||||
|
||||
#include "src/scanner.h"
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
#define IS_PY3K
|
||||
#endif
|
||||
|
||||
struct module_state {
|
||||
PyObject *error;
|
||||
};
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
|
||||
#else
|
||||
#define GETSTATE(m) (&_state)
|
||||
static struct module_state _state;
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * tokenize(text) -> tuple of (offset, len, type) tuples.
 *
 * Converts the single argument to unicode, encodes it as UTF-8, runs the
 * scanner's tokenize() over it and returns one 3-tuple of unsigned integers
 * per token. Returns NULL with a Python exception set on failure.
 *
 * All resources (token array, temporary Python objects and a partially
 * built result) are released through the single `cleanup` path.
 */
static PyObject *py_tokenize(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    PyObject *unistr = NULL;
    PyObject *result = NULL;
    token_array *tokens = NULL;
    char *input = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif

    if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) {
        return NULL;
    }

    unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    // The returned buffer is owned by unistr, which stays alive until cleanup.
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto cleanup;
    }

    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto cleanup;
    }

    tokens = tokenize(input);
    if (tokens == NULL) {
        /* Returning NULL without an exception set makes CPython raise a
         * SystemError; report the failure explicitly instead. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError, "Error tokenizing input");
        }
        goto cleanup;
    }

    result = PyTuple_New((Py_ssize_t)tokens->n);
    if (result == NULL) {
        goto cleanup;
    }

    for (size_t i = 0; i < tokens->n; i++) {
        token_t token = tokens->a[i];

        /* "I" consumes an unsigned int from the varargs, so cast explicitly
         * rather than relying on the width of the token fields. */
        PyObject *tuple = Py_BuildValue("III",
                                        (unsigned int)token.offset,
                                        (unsigned int)token.len,
                                        (unsigned int)token.type);

        /* PyTuple_SetItem steals the reference to tuple, even on failure. */
        if (tuple == NULL || PyTuple_SetItem(result, (Py_ssize_t)i, tuple) < 0) {
            /* Drop the partially filled result so it is not leaked. */
            Py_CLEAR(result);
            goto cleanup;
        }
    }

cleanup:
    if (tokens != NULL) {
        token_array_destroy(tokens);
    }
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    /* NULL here means an exception has been set above. */
    return result;
}
|
||||
|
||||
static PyMethodDef tokenize_methods[] = {
|
||||
{"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"},
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
|
||||
|
||||
#ifdef IS_PY3K
|
||||
|
||||
/* GC traverse hook: visits the exception object kept in the module state. */
static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) {
    struct module_state *state = GETSTATE(m);

    /* Py_VISIT uses the `visit` and `arg` parameters by name. */
    Py_VISIT(state->error);
    return 0;
}
|
||||
|
||||
/* GC clear hook: drops the exception object kept in the module state. */
static int tokenize_clear(PyObject *m) {
    struct module_state *state = GETSTATE(m);

    Py_CLEAR(state->error);
    return 0;
}
|
||||
|
||||
|
||||
static struct PyModuleDef module_def = {
|
||||
PyModuleDef_HEAD_INIT,
|
||||
"_tokenize",
|
||||
NULL,
|
||||
sizeof(struct module_state),
|
||||
tokenize_methods,
|
||||
NULL,
|
||||
tokenize_traverse,
|
||||
tokenize_clear,
|
||||
NULL
|
||||
};
|
||||
|
||||
#define INITERROR return NULL
|
||||
|
||||
PyObject *
|
||||
PyInit_tokenize(void) {
|
||||
#else
|
||||
#define INITERROR return
|
||||
|
||||
void
|
||||
init_tokenize(void) {
|
||||
#endif
|
||||
|
||||
#ifdef IS_PY3K
|
||||
PyObject *module = PyModule_Create(&module_def);
|
||||
#else
|
||||
PyObject *module = Py_InitModule("_tokenize", tokenize_methods);
|
||||
#endif
|
||||
|
||||
if (module == NULL)
|
||||
INITERROR;
|
||||
struct module_state *st = GETSTATE(module);
|
||||
|
||||
st->error = PyErr_NewException("_tokenize.Error", NULL, NULL);
|
||||
if (st->error == NULL) {
|
||||
Py_DECREF(module);
|
||||
INITERROR;
|
||||
}
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
return module;
|
||||
#endif
|
||||
}
|
||||
@@ -1,68 +0,0 @@
|
||||
from postal.utils.enum import Enum, EnumValue
|
||||
|
||||
|
||||
class token_types(Enum):
    """Enum of token type ids, looked up by id via ``token_types.from_id``.

    Also exposes two groupings of members: ``WORD_TOKEN_TYPES`` and
    ``NUMERIC_TOKEN_TYPES``.
    """

    # Word types
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # Special tokens
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # Numbers and numeric types
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # Punctuation types, may separate a phrase
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # Non-letters and whitespace
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

    # Members listed under "Word types" above
    WORD_TOKEN_TYPES = {
        WORD,
        ABBREVIATION,
        IDEOGRAPHIC_CHAR,
        HANGUL_SYLLABLE,
        ACRONYM,
    }

    # Members listed under "Numbers and numeric types" above
    NUMERIC_TOKEN_TYPES = {
        NUMERIC,
        ORDINAL,
        ROMAN_NUMERAL,
        IDEOGRAPHIC_NUMBER,
    }
|
||||
@@ -1,14 +0,0 @@
|
||||
from postal.text.encoding import safe_encode, safe_decode
|
||||
from postal.text import _tokenize
|
||||
from postal.text.token_types import token_types
|
||||
|
||||
|
||||
def tokenize_raw(s):
    """Return the unprocessed result of ``_tokenize.tokenize`` for *s*.

    *s* is passed through ``safe_decode`` before being handed to the
    extension module.
    """
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)
|
||||
|
||||
|
||||
def tokenize(s):
    """Tokenize *s* into a list of ``(token_text, token_type)`` pairs.

    The decoded form of *s* is fed to ``_tokenize.tokenize``; each returned
    ``(start, length, type_id)`` triple is used to slice the encoded form of
    *s* (presumably because the offsets are byte offsets -- verify against
    the scanner), and the slice is decoded again for the result.
    """
    text = safe_decode(s)
    raw = safe_encode(s)

    pairs = []
    for offset, size, type_id in _tokenize.tokenize(text):
        piece = safe_decode(raw[offset:offset + size])
        pairs.append((piece, token_types.from_id(type_id)))
    return pairs
|
||||
@@ -1,62 +0,0 @@
|
||||
|
||||
class EnumValue(object):
    """A single enum member: a raw ``value`` plus an optional ``name``.

    Members hash and compare by ``value``, and compare equal to the raw
    value itself. Rich comparison methods are defined because Python 3
    ignores ``__cmp__``; ``__cmp__`` is kept for explicit Python 2 callers.
    If ``name`` is left as None, ``EnumMeta`` fills it in with the attribute
    name the member is bound to.
    """

    def __init__(self, value, name=None):
        self.value = value
        self.name = name

    @staticmethod
    def _unwrap(other):
        # Compare against another member's raw value, or the operand itself.
        return other.value if isinstance(other, EnumValue) else other

    def __hash__(self):
        return hash(self.value)

    def __cmp__(self, other):
        other = self._unwrap(other)
        return (self.value > other) - (self.value < other)

    def __eq__(self, other):
        return self.value == self._unwrap(other)

    def __ne__(self, other):
        # Python 2 does not derive __ne__ from __eq__.
        return not self.__eq__(other)

    def __lt__(self, other):
        return self.value < self._unwrap(other)

    def __le__(self, other):
        return self.value <= self._unwrap(other)

    def __gt__(self, other):
        return self.value > self._unwrap(other)

    def __ge__(self, other):
        return self.value >= self._unwrap(other)

    def __unicode__(self):
        return self.name

    def __str__(self):
        return self.name

    def __repr__(self):
        return self.name


class EnumMeta(type):
    """Metaclass that registers the ``EnumValue`` attributes of a class.

    Every class gets its own copy of the inherited ``registry`` (value ->
    member) and ``name_registry`` (name -> member), so members declared on a
    subclass do not leak into its bases.
    """

    def __init__(cls, name, bases, dict_):
        cls.registry = cls.registry.copy()
        cls.name_registry = cls.name_registry.copy()
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for k, v in dict_.items():
            # Skip values that are already registered (first one wins).
            if isinstance(v, EnumValue) and v.value not in cls.registry:
                if v.name is None:
                    v.name = k
                cls.registry[v.value] = v
                cls.name_registry[v.name] = v
        return super(EnumMeta, cls).__init__(name, bases, dict_)

    def __iter__(cls):
        return iter(cls.registry.values())

    def __getitem__(cls, key):
        return cls.registry[key]


# The base class is created by calling the metaclass directly: the Python 2
# ``__metaclass__`` attribute is ignored on Python 3, and the Python 3
# ``metaclass=`` keyword is a syntax error on Python 2.
class Enum(EnumMeta('_EnumBase', (object,),
                    {'registry': {}, 'name_registry': {}})):
    """Base class for enums whose members are ``EnumValue`` attributes."""

    registry = {}
    name_registry = {}

    @classmethod
    def from_id(cls, value):
        """Return the member registered under *value*.

        Raises ValueError if no member has that value.
        """
        try:
            return cls.registry[value]
        except KeyError:
            raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))

    @classmethod
    def from_string(cls, name):
        """Return the member registered under *name*.

        Raises ValueError if no member has that name.
        """
        try:
            return cls.name_registry[name]
        except KeyError:
            raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))
|
||||
78
setup.py
78
setup.py
@@ -1,78 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from setuptools import setup, Extension, Command, find_packages
|
||||
from setuptools.command.build_py import build_py
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from setuptools.command.install import install
|
||||
from distutils.errors import DistutilsArgError
|
||||
|
||||
# Directory (relative to this file) holding the C sources.
SRC_DIR = 'src'
# Absolute directory containing this setup script; used as an include dir.
this_dir = os.path.realpath(os.path.dirname(__file__))


def main():
    """Invoke setuptools ``setup()`` for the pypostal package.

    Declares four C extensions: ``postal.text._tokenize`` and
    ``postal.text._normalize`` are compiled from sources under ``SRC_DIR``
    plus their Python wrapper file, while ``postal._expand`` and
    ``postal._parser`` are compiled from a single wrapper file and link
    against the ``postal`` library.
    """

    def in_src(*names):
        # Paths of the given files under SRC_DIR.
        return [os.path.join(SRC_DIR, f) for f in names]

    tokenize_ext = Extension(
        'postal.text._tokenize',
        sources=in_src(
            'scanner.c',
            'string_utils.c',
            'tokens.c',
            'utf8proc/utf8proc.c',
        ) + ['python/postal/text/pytokenize.c'],
        include_dirs=[this_dir],
        extra_compile_args=['-O0', '-std=c99',
                            '-Wno-unused-function'],
    )

    normalize_ext = Extension(
        'postal.text._normalize',
        sources=in_src(
            'normalize.c',
            'string_utils.c',
            'utf8proc/utf8proc.c',
            'tokens.c',
            'unicode_scripts.c',
            'transliterate.c',
            'file_utils.c',
            'trie.c',
            'trie_search.c',
        ) + ['python/postal/text/pynormalize.c'],
        include_dirs=[this_dir],
        extra_compile_args=['-std=c99', '-DHAVE_CONFIG_H',
                            '-Wno-unused-function'],
    )

    expand_ext = Extension(
        'postal._expand',
        sources=['python/postal/pyexpand.c'],
        libraries=['postal'],
        extra_compile_args=['-std=c99',
                            '-Wno-unused-function'],
    )

    parser_ext = Extension(
        'postal._parser',
        sources=['python/postal/pyparser.c'],
        libraries=['postal'],
        extra_compile_args=['-std=c99',
                            '-Wno-unused-function'],
    )

    setup(
        name='pypostal',
        version='0.2',
        install_requires=[
            'six',
        ],
        ext_modules=[
            tokenize_ext,
            normalize_ext,
            expand_ext,
            parser_ext,
        ],
        packages=find_packages('python'),
        package_dir={'': 'python'},
        include_package_data=True,
        zip_safe=False,
        url='http://mapzen.com',
        description='Fast address standardization and deduplication',
        license='MIT License',
        maintainer='mapzen.com',
        maintainer_email='pelias@mapzen.com'
    )


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user