Files
libpostal/python/postal/pyexpand.c
2015-12-15 02:56:02 -05:00

341 lines
10 KiB
C

#include <Python.h>
#include <libpostal/libpostal.h>
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
struct module_state {
PyObject *error;
};
/*
 * GETSTATE(m) yields a pointer to the module_state for module object `m`.
 * On Python 3 the state is fetched from the module object itself; on
 * Python 2 a single file-level instance is used and `m` is not evaluated.
 */
#if defined(IS_PY3K)
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
static struct module_state _state;
#define GETSTATE(m) (&_state)
#endif
/* Free the first `n` strings of `strings` and then the array itself. */
static void py_expand_free_strings(char **strings, size_t n) {
    if (strings == NULL) {
        return;
    }
    for (size_t i = 0; i < n; i++) {
        free(strings[i]);
    }
    free(strings);
}

/*
 * expand_address(address, languages, **options) -> list of unicode strings
 *
 * Python entry point wrapping libpostal's expand_address().
 *
 *   address    object convertible to unicode; passed to libpostal as UTF-8.
 *   languages  non-empty sequence of language codes (bytes or unicode).
 *              Items of any other type (e.g. None) are skipped.
 *   options    every remaining keyword in `kwlist` overrides the field of
 *              the same name in LIBPOSTAL_DEFAULT_OPTIONS.
 *
 * Returns a new list with one unicode string per expansion, or NULL with a
 * Python exception set:
 *   TypeError     bad address, missing/empty languages, over-long code
 *   MemoryError   allocation failure
 *   RuntimeError  expand_address() returned NULL
 *
 * Every resource acquired here is released on the single `cleanup` path.
 */
static PyObject *py_expand(PyObject *self, PyObject *args, PyObject *keywords) {
    PyObject *arg_input = NULL;
    /* Optional argument: PyArg_Parse* leaves omitted optionals untouched,
       so it must start out as NULL rather than uninitialised. */
    PyObject *arg_languages = NULL;

    normalize_options_t options = LIBPOSTAL_DEFAULT_OPTIONS;

    PyObject *result = NULL;
    PyObject *unistr_input = NULL;
#ifndef IS_PY3K
    PyObject *str_input = NULL;
#endif
    PyObject *seq = NULL;
    PyObject *encoded = NULL;

    char *input = NULL;
    char **languages = NULL;
    size_t num_languages = 0;
    char **expansions = NULL;
    size_t num_expansions = 0;

    static char *kwlist[] = {"address",
                             "languages",
                             "address_components",
                             "latin_ascii",
                             "transliterate",
                             "strip_accents",
                             "decompose",
                             "lowercase",
                             "trim_string",
                             "replace_word_hyphens",
                             "delete_word_hyphens",
                             "replace_numeric_hyphens",
                             "delete_numeric_hyphens",
                             "split_alpha_from_numeric",
                             "delete_final_periods",
                             "delete_acronym_periods",
                             "drop_english_possessives",
                             "delete_apostrophes",
                             "expand_numex",
                             "roman_numerals",
                             NULL
                            };

    /* The local types must match the format units exactly:
       "H" stores an unsigned short, "I" stores an unsigned int. */
    unsigned short address_components = options.address_components;
    unsigned int latin_ascii = options.latin_ascii;
    unsigned int transliterate = options.transliterate;
    unsigned int strip_accents = options.strip_accents;
    unsigned int decompose = options.decompose;
    unsigned int lowercase = options.lowercase;
    unsigned int trim_string = options.trim_string;
    unsigned int replace_word_hyphens = options.replace_word_hyphens;
    unsigned int delete_word_hyphens = options.delete_word_hyphens;
    unsigned int replace_numeric_hyphens = options.replace_numeric_hyphens;
    unsigned int delete_numeric_hyphens = options.delete_numeric_hyphens;
    unsigned int split_alpha_from_numeric = options.split_alpha_from_numeric;
    unsigned int delete_final_periods = options.delete_final_periods;
    unsigned int delete_acronym_periods = options.delete_acronym_periods;
    /* NOTE(review): assumes normalize_options_t has these two fields, as the
       keyword list implies -- confirm against libpostal.h. */
    unsigned int drop_english_possessives = options.drop_english_possessives;
    unsigned int delete_apostrophes = options.delete_apostrophes;
    unsigned int expand_numex = options.expand_numex;
    unsigned int roman_numerals = options.roman_numerals;

    /* One format unit and one output pointer per kwlist entry, in kwlist
       order: O (address) | O (languages) H (address_components) followed by
       17 I units, written as 6 + 6 + 5 so they can be counted. */
    if (!PyArg_ParseTupleAndKeywords(args, keywords,
                                     "O|OH" "IIIIII" "IIIIII" "IIIII" ":pyexpand", kwlist,
                                     &arg_input, &arg_languages,
                                     &address_components,
                                     &latin_ascii,
                                     &transliterate,
                                     &strip_accents,
                                     &decompose,
                                     &lowercase,
                                     &trim_string,
                                     &replace_word_hyphens,
                                     &delete_word_hyphens,
                                     &replace_numeric_hyphens,
                                     &delete_numeric_hyphens,
                                     &split_alpha_from_numeric,
                                     &delete_final_periods,
                                     &delete_acronym_periods,
                                     &drop_english_possessives,
                                     &delete_apostrophes,
                                     &expand_numex,
                                     &roman_numerals
                                     )) {
        return NULL;
    }

    options.address_components = address_components;
    options.latin_ascii = latin_ascii;
    options.transliterate = transliterate;
    options.strip_accents = strip_accents;
    options.decompose = decompose;
    options.lowercase = lowercase;
    options.trim_string = trim_string;
    options.replace_word_hyphens = replace_word_hyphens;
    options.delete_word_hyphens = delete_word_hyphens;
    options.replace_numeric_hyphens = replace_numeric_hyphens;
    options.delete_numeric_hyphens = delete_numeric_hyphens;
    options.split_alpha_from_numeric = split_alpha_from_numeric;
    options.delete_final_periods = delete_final_periods;
    options.delete_acronym_periods = delete_acronym_periods;
    options.drop_english_possessives = drop_english_possessives;
    options.delete_apostrophes = delete_apostrophes;
    options.expand_numex = expand_numex;
    options.roman_numerals = roman_numerals;

    /* --- address -> UTF-8 C string ------------------------------------- */
    unistr_input = PyUnicode_FromObject(arg_input);
    if (unistr_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be converted to unicode");
        goto cleanup;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+. The buffer is owned by
    // unistr_input, so it stays valid until cleanup.
    input = (char *)PyUnicode_AsUTF8(unistr_input);
#else
    // Python 2 encoding. The buffer is owned by str_input.
    str_input = PyUnicode_AsEncodedString(unistr_input, "utf-8", "strict");
    if (str_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be utf-8 encoded");
        goto cleanup;
    }
    input = PyBytes_AsString(str_input);
#endif
    if (input == NULL) {
        /* The failing conversion has already set an exception. */
        goto cleanup;
    }

    /* --- languages -> array of C strings ------------------------------- */
    if (arg_languages != NULL && PySequence_Check(arg_languages)) {
        seq = PySequence_Fast(arg_languages, "Expected a sequence");
        if (seq == NULL) {
            goto cleanup;
        }

        Py_ssize_t len_languages = PySequence_Fast_GET_SIZE(seq);
        if (len_languages > 0) {
            languages = calloc((size_t)len_languages, sizeof *languages);
            if (languages == NULL) {
                PyErr_NoMemory();
                goto cleanup;
            }

            for (Py_ssize_t i = 0; i < len_languages; i++) {
                PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
                const char *language = NULL;

                if (PyBytes_Check(item)) {
                    language = PyBytes_AsString(item);
                } else if (PyUnicode_Check(item)) {
                    /* Unicode codes are accepted too; `encoded` owns the
                       temporary UTF-8 bytes until the copy below. */
                    encoded = PyUnicode_AsUTF8String(item);
                    if (encoded == NULL) {
                        goto cleanup;
                    }
                    language = PyBytes_AsString(encoded);
                }

                if (language == NULL) {
                    /* Not a string (e.g. None): skipped. */
                    continue;
                }

                if (strlen(language) >= MAX_LANGUAGE_LEN) {
                    PyErr_SetString(PyExc_TypeError, "language was longer than a language code");
                    goto cleanup;
                }

                /* Copy so the array does not depend on the lifetime of the
                   Python objects. */
                languages[num_languages] = strdup(language);
                if (languages[num_languages] == NULL) {
                    PyErr_NoMemory();
                    goto cleanup;
                }
                num_languages++;
                Py_CLEAR(encoded);
            }

            if (num_languages > 0) {
                options.languages = languages;
            }
        }

        options.num_languages = num_languages;
        Py_CLEAR(seq);
    }

    if (languages == NULL) {
        PyErr_SetString(PyExc_TypeError, "Must specify languages=[list of language codes] to expand_address");
        goto cleanup;
    }

    /* --- run libpostal -------------------------------------------------- */
    expansions = expand_address(input, options, &num_expansions);
    if (expansions == NULL) {
        /* Returning NULL without an exception would surface as SystemError. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError, "expand_address failed");
        }
        num_expansions = 0;
        goto cleanup;
    }

    /* --- build the result list ----------------------------------------- */
    result = PyList_New((Py_ssize_t)num_expansions);
    if (result == NULL) {
        goto cleanup;
    }

    for (size_t i = 0; i < num_expansions; i++) {
        const char *expansion = expansions[i];
        PyObject *u = PyUnicode_DecodeUTF8(expansion, (Py_ssize_t)strlen(expansion), "strict");
        if (u == NULL) {
            /* Drop the partial list AND reset the pointer so a freed
               object is never returned. */
            Py_CLEAR(result);
            goto cleanup;
        }
        // Note: PyList_SET_ITEM steals the reference to u.
        PyList_SET_ITEM(result, (Py_ssize_t)i, u);
    }

cleanup:
    py_expand_free_strings(expansions, num_expansions);
    /* NOTE(review): presumably expand_address() does not keep the language
       strings after it returns -- the array itself was already freed right
       after the call before this change; confirm for the strings. */
    py_expand_free_strings(languages, num_languages);
    Py_XDECREF(encoded);
    Py_XDECREF(seq);
#ifndef IS_PY3K
    Py_XDECREF(str_input);
#endif
    Py_XDECREF(unistr_input);

    return result;
}
/* Method table for the module: a single keyword-accepting function. */
static PyMethodDef expand_methods[] = {
    {
        "expand_address",
        (PyCFunction)py_expand,
        METH_VARARGS | METH_KEYWORDS,
        "expand_address(text, **kw)"
    },
    /* Sentinel entry terminating the table. */
    {NULL, NULL, 0, NULL},
};
#ifdef IS_PY3K
/*
 * Traversal hook registered in module_def: reports the exception object
 * held in the module state to the visitor.
 * (Py_VISIT relies on the parameter names `visit` and `arg`.)
 */
static int expand_traverse(PyObject *m, visitproc visit, void *arg) {
    struct module_state *state = GETSTATE(m);

    Py_VISIT(state->error);
    return 0;
}
/*
 * Clear hook registered in module_def: drops the module's reference to its
 * exception object, then tears libpostal down.
 * NOTE(review): this runs libpostal_teardown() every time the hook is
 * invoked -- confirm that repeated teardown is safe.
 */
static int expand_clear(PyObject *m) {
    struct module_state *state = GETSTATE(m);

    Py_CLEAR(state->error);
    libpostal_teardown();
    return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_expand",
NULL,
sizeof(struct module_state),
expand_methods,
NULL,
expand_traverse,
expand_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit_expand(void) {
#else
#define INITERROR return
void cleanup_libpostal(void) {
libpostal_teardown();
}
void
init_expand(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_expand", expand_methods);
#endif
if (module == NULL) {
INITERROR;
}
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_expand.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
if (!libpostal_setup()) {
PyErr_SetString(PyExc_TypeError,
"Error loading libpostal");
}
PyModule_AddIntConstant(module, "ADDRESS_ANY", ADDRESS_ANY);
PyModule_AddIntConstant(module, "ADDRESS_NAME", ADDRESS_NAME);
PyModule_AddIntConstant(module, "ADDRESS_HOUSE_NUMBER", ADDRESS_HOUSE_NUMBER);
PyModule_AddIntConstant(module, "ADDRESS_STREET", ADDRESS_STREET);
PyModule_AddIntConstant(module, "ADDRESS_UNIT", ADDRESS_UNIT);
PyModule_AddIntConstant(module, "ADDRESS_LOCALITY", ADDRESS_LOCALITY);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN1", ADDRESS_ADMIN1);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN2", ADDRESS_ADMIN2);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN3", ADDRESS_ADMIN3);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN4", ADDRESS_ADMIN4);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN_OTHER", ADDRESS_ADMIN_OTHER);
PyModule_AddIntConstant(module, "ADDRESS_COUNTRY", ADDRESS_COUNTRY);
PyModule_AddIntConstant(module, "ADDRESS_NEIGHBORHOOD", ADDRESS_NEIGHBORHOOD);
PyModule_AddIntConstant(module, "ADDRESS_ALL", ADDRESS_ALL);
#ifndef IS_PY3K
Py_AtExit(&cleanup_libpostal);
#endif
#if IS_PY3K
return module;
#endif
}