From 7af0e2d967e801525aff19ff5a290e09a98f4fa4 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 14 Dec 2015 18:18:16 -0500 Subject: [PATCH] [python] Adding Python bindings to the expand API --- python/postal/pyexpand.c | 340 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 340 insertions(+) create mode 100644 python/postal/pyexpand.c diff --git a/python/postal/pyexpand.c b/python/postal/pyexpand.c new file mode 100644 index 00000000..5188e72b --- /dev/null +++ b/python/postal/pyexpand.c @@ -0,0 +1,340 @@ +#include +#include + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +struct module_state { + PyObject *error; +}; + + +#ifdef IS_PY3K + #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else + #define GETSTATE(m) (&_state) + static struct module_state _state; +#endif + + +static PyObject *py_expand(PyObject *self, PyObject *args, PyObject *keywords) { + PyObject *arg_input; + PyObject *arg_languages; + normalize_options_t options = LIBPOSTAL_DEFAULT_OPTIONS; + + PyObject *result = NULL; + + static char *kwlist[] = {"address", + "languages", + "address_components", + "latin_ascii", + "transliterate", + "strip_accents", + "decompose", + "lowercase", + "trim_string", + "replace_word_hyphens", + "delete_word_hyphens", + "replace_numeric_hyphens", + "delete_numeric_hyphens", + "split_alpha_from_numeric", + "delete_final_periods", + "delete_acronym_periods", + "drop_english_possessives", + "delete_apostrophes", + "expand_numex", + "roman_numerals", + NULL + }; + + uint32_t address_components = options.address_components; + uint32_t latin_ascii = options.latin_ascii; + uint32_t transliterate = options.transliterate; + uint32_t strip_accents = options.strip_accents; + uint32_t decompose = options.decompose; + uint32_t lowercase = options.lowercase; + uint32_t trim_string = options.trim_string; + uint32_t replace_word_hyphens = options.replace_word_hyphens; + uint32_t delete_word_hyphens = options.delete_word_hyphens; + uint32_t replace_numeric_hyphens = options.replace_numeric_hyphens; + uint32_t delete_numeric_hyphens = options.delete_numeric_hyphens; + uint32_t split_alpha_from_numeric = options.split_alpha_from_numeric; + uint32_t delete_final_periods = options.delete_final_periods; + uint32_t delete_acronym_periods = options.delete_acronym_periods; + uint32_t expand_numex = options.expand_numex; + uint32_t roman_numerals = options.roman_numerals; + + if (!PyArg_ParseTupleAndKeywords(args, keywords, + "O|OHIIIIIIIIIIIIIIIIII:pyexpand", kwlist, + &arg_input, &arg_languages, + &address_components, + &latin_ascii, + &transliterate, + &strip_accents, + &decompose, + &lowercase, + &trim_string, + &replace_word_hyphens, + &delete_word_hyphens, + &replace_numeric_hyphens, + &delete_numeric_hyphens, + &split_alpha_from_numeric, + &delete_final_periods, + &delete_acronym_periods, + &expand_numex, + &roman_numerals + )) { + return 0; + } + + + options.address_components = address_components; + options.latin_ascii = latin_ascii; + options.transliterate = transliterate; + options.strip_accents = strip_accents; + options.decompose = decompose; + options.lowercase = lowercase; + options.trim_string = trim_string; + options.replace_word_hyphens = replace_word_hyphens; + options.delete_word_hyphens = delete_word_hyphens; + options.replace_numeric_hyphens = replace_numeric_hyphens; + options.delete_numeric_hyphens = delete_numeric_hyphens; + options.split_alpha_from_numeric = split_alpha_from_numeric; + options.delete_final_periods = delete_final_periods; + options.delete_acronym_periods = delete_acronym_periods; + options.expand_numex = expand_numex; + options.roman_numerals = roman_numerals; + + PyObject *unistr_input = PyUnicode_FromObject(arg_input); + if (unistr_input == NULL) { + PyErr_SetString(PyExc_TypeError, + "Input could not be converted to unicode"); + return 0; + } + + char *input = NULL; + + #ifdef IS_PY3K + // Python 3 encoding, supported by Python 3.3+ + + input = PyUnicode_AsUTF8(unistr_input); + + #else + // Python 2 encoding + + PyObject *str_input = PyUnicode_AsEncodedString(unistr_input, "utf-8", "strict"); + if (str_input == NULL) { + PyErr_SetString(PyExc_TypeError, + "Input could not be utf-8 encoded"); + return 0; + } + + input = PyBytes_AsString(str_input); + #endif + + if (input == NULL) { + goto exit_decref_str; + } + + char **languages = NULL; + + if (PySequence_Check(arg_languages)) { + PyObject *seq = PySequence_Fast(arg_languages, "Expected a sequence"); + Py_ssize_t len_languages = PySequence_Length(arg_languages); + size_t num_languages = 0; + + if (len_languages > 0) { + languages = malloc(len_languages * sizeof(char *)); + if (languages == NULL) { + goto exit_decref_str; + } + + char *language = NULL; + + for (int i = 0; i < len_languages; i++) { + PyObject *item = PySequence_Fast_GET_ITEM(seq, i); + + language = NULL; + + #if IS_PY3K + + if (PyBytes_Check(item)) { + language = PyBytes_AsString(item); + } + + #else + + if (PyString_Check(item)) { + language = PyString_AsString(item); + } + + #endif + + if (language != NULL && item != Py_None) { + if (strlen(language) >= MAX_LANGUAGE_LEN) { + PyErr_SetString(PyExc_TypeError, "language was longer than a language code"); + free(languages); + Py_DECREF(seq); + goto exit_decref_str; + } + languages[num_languages] = strdup(language); + num_languages++; + } + + } + + if (num_languages > 0) { + options.languages = languages; + } + + } + options.num_languages = num_languages; + + Py_DECREF(seq); + } + + if (languages == NULL) { + PyErr_SetString(PyExc_TypeError, "Must specify languages=[list of language codes] to expand_address"); + goto exit_decref_str; + } + + + size_t num_expansions = 0; + char **expansions = expand_address(input, options, &num_expansions); + + if (languages != NULL) { + free(languages); + } + + if (expansions == NULL) { + goto exit_decref_str; + } + + result = PyList_New((Py_ssize_t)num_expansions); + if (!result) { + goto exit_free_expansions; + } + + for (int i = 0; i < num_expansions; i++) { + char *expansion = expansions[i]; + PyObject *u = PyUnicode_DecodeUTF8((const char *)expansion, strlen(expansion), "strict"); + if (u == NULL) { + Py_DECREF(result); + goto exit_free_expansions; + } + // Note: PyList_SetItem steals a reference, so don't worry about DECREF + PyList_SetItem(result, (Py_ssize_t)i, u); + } + +exit_free_expansions: + for (int i = 0; i < num_expansions; i++) { + free(expansions[i]); + } + free(expansions); +exit_decref_str: + #ifndef IS_PY3K + Py_XDECREF(str_input); + #endif +exit_decref_unistr: + Py_XDECREF(unistr_input); + + return result; +} + +static PyMethodDef expand_methods[] = { + {"expand_address", (PyCFunction)py_expand, METH_VARARGS | METH_KEYWORDS, "expand_address(text, **kw)"}, + {NULL, NULL}, +}; + + + +#ifdef IS_PY3K + +static int expand_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int expand_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + libpostal_teardown(); + return 0; +} + +static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_expand", + NULL, + sizeof(struct module_state), + expand_methods, + NULL, + expand_traverse, + expand_clear, + NULL +}; + +#define INITERROR return NULL + +PyObject * +PyInit_expand(void) { + +#else + +#define INITERROR return + +void cleanup_libpostal(void) { + libpostal_teardown(); +} + +void +init_expand(void) { + +#endif + +#ifdef IS_PY3K + PyObject *module = PyModule_Create(&module_def); +#else + PyObject *module = Py_InitModule("_expand", expand_methods); +#endif + + if (module == NULL) { + INITERROR; + } + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("_expand.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + + if (!libpostal_setup()) { + PyErr_SetString(PyExc_TypeError, + "Error loading libpostal"); + } + + PyModule_AddIntConstant(module, "ADDRESS_ANY", ADDRESS_ANY); + PyModule_AddIntConstant(module, "ADDRESS_NAME", ADDRESS_NAME); + PyModule_AddIntConstant(module, "ADDRESS_HOUSE_NUMBER", ADDRESS_HOUSE_NUMBER); + PyModule_AddIntConstant(module, "ADDRESS_STREET", ADDRESS_STREET); + PyModule_AddIntConstant(module, "ADDRESS_UNIT", ADDRESS_UNIT); + PyModule_AddIntConstant(module, "ADDRESS_LOCALITY", ADDRESS_LOCALITY); + PyModule_AddIntConstant(module, "ADDRESS_ADMIN1", ADDRESS_ADMIN1); + PyModule_AddIntConstant(module, "ADDRESS_ADMIN2", ADDRESS_ADMIN2); + PyModule_AddIntConstant(module, "ADDRESS_ADMIN3", ADDRESS_ADMIN3); + PyModule_AddIntConstant(module, "ADDRESS_ADMIN4", ADDRESS_ADMIN4); + PyModule_AddIntConstant(module, "ADDRESS_ADMIN_OTHER", ADDRESS_ADMIN_OTHER); + PyModule_AddIntConstant(module, "ADDRESS_COUNTRY", ADDRESS_COUNTRY); + PyModule_AddIntConstant(module, "ADDRESS_NEIGHBORHOOD", ADDRESS_NEIGHBORHOOD); + PyModule_AddIntConstant(module, "ADDRESS_ALL", ADDRESS_ALL); + +#ifndef IS_PY3K + Py_AtExit(&cleanup_libpostal); +#endif + +#if IS_PY3K + return module; +#endif +} +