diff --git a/python/postal/parser.py b/python/postal/parser.py
new file mode 100644
index 00000000..b6e96891
--- /dev/null
+++ b/python/postal/parser.py
@@ -0,0 +1,17 @@
+import _parser
+from postal.text.encoding import safe_decode
+
+DEFAULT_LANGUAGES = ('en',)
+
+
+def parse_address(address, language=None, country=None):
+    '''
+    Parse an address into labeled components using libpostal.
+
+    @param address: the address as either Unicode or a UTF-8 encoded string
+    @param language (optional): language code
+    @param country (optional): country code
+    @return: list of (component, label) tuples from the C extension
+    '''
+    address = safe_decode(address, 'utf-8')
+    return _parser.parse_address(address, language=language, country=country)
diff --git a/python/postal/pyparser.c b/python/postal/pyparser.c
new file mode 100644
index 00000000..24ee28ee
--- /dev/null
+++ b/python/postal/pyparser.c
@@ -0,0 +1,289 @@
+#include <Python.h>
+#include <libpostal/libpostal.h>
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K
+#endif
+
+struct module_state {
+    PyObject *error;
+};
+
+
+#ifdef IS_PY3K
+    #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
+#else
+    #define GETSTATE(m) (&_state)
+    static struct module_state _state;
+#endif
+
+
+/*
+ * Convert a Python object to a UTF-8 encoded C string.
+ *
+ * obj:    object to convert (anything PyUnicode_FromObject accepts)
+ * what:   argument name used as the prefix of error messages
+ * holder: receives a new reference to the object that owns the returned
+ *         buffer; the caller must Py_DECREF it after the last use of the
+ *         returned pointer
+ *
+ * Returns the UTF-8 buffer, or NULL with a Python exception set (in which
+ * case *holder is NULL).
+ */
+static char *utf8_from_object(PyObject *obj, const char *what, PyObject **holder) {
+    char *utf8 = NULL;
+    PyObject *unistr = NULL;
+#ifndef IS_PY3K
+    PyObject *encoded = NULL;
+#endif
+
+    *holder = NULL;
+
+    unistr = PyUnicode_FromObject(obj);
+    if (unistr == NULL) {
+        PyErr_Format(PyExc_TypeError,
+                     "%s could not be converted to unicode", what);
+        return NULL;
+    }
+
+#ifdef IS_PY3K
+    // Python 3 encoding, supported by Python 3.3+. The UTF-8 buffer is cached
+    // on the unicode object, so that object is what has to stay alive.
+    utf8 = (char *)PyUnicode_AsUTF8(unistr);
+    if (utf8 == NULL) {
+        Py_DECREF(unistr);
+        return NULL;
+    }
+    *holder = unistr;
+#else
+    // Python 2 encoding: the encoded bytes object owns the buffer
+    encoded = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
+    Py_DECREF(unistr);
+    if (encoded == NULL) {
+        PyErr_Format(PyExc_TypeError,
+                     "%s could not be utf-8 encoded", what);
+        return NULL;
+    }
+
+    utf8 = PyBytes_AsString(encoded);
+    if (utf8 == NULL) {
+        Py_DECREF(encoded);
+        return NULL;
+    }
+    *holder = encoded;
+#endif
+
+    return utf8;
+}
+
+
+/*
+ * parse_address(address, language=None, country=None)
+ *
+ * Runs the libpostal address parser on address and returns a list of
+ * (component, label) unicode tuples. language and country are passed to
+ * libpostal as parser options when they are not None.
+ *
+ * Returns NULL with a Python exception set on failure.
+ */
+static PyObject *py_parse_address(PyObject *self, PyObject *args, PyObject *keywords) {
+    PyObject *arg_input;
+    PyObject *arg_language = Py_None;
+    PyObject *arg_country = Py_None;
+
+    // Owners of the UTF-8 buffers below; released on every exit path
+    PyObject *input_holder = NULL;
+    PyObject *language_holder = NULL;
+    PyObject *country_holder = NULL;
+
+    char *input = NULL;
+    char *language = NULL;
+    char *country = NULL;
+
+    address_parser_options_t options = LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
+    address_parser_response_t *parsed = NULL;
+
+    PyObject *result = NULL;
+    Py_ssize_t num_components;
+    Py_ssize_t i;
+
+    static char *kwlist[] = {"address",
+                             "language",
+                             "country",
+                             NULL
+                            };
+
+    if (!PyArg_ParseTupleAndKeywords(args, keywords,
+                                     "O|OO:parse_address", kwlist,
+                                     &arg_input, &arg_language,
+                                     &arg_country
+                                     )) {
+        return NULL;
+    }
+
+    input = utf8_from_object(arg_input, "Input", &input_holder);
+    if (input == NULL) {
+        goto exit_cleanup;
+    }
+
+    if (arg_language != Py_None) {
+        language = utf8_from_object(arg_language, "Language", &language_holder);
+        if (language == NULL) {
+            goto exit_cleanup;
+        }
+    }
+
+    if (arg_country != Py_None) {
+        country = utf8_from_object(arg_country, "Country", &country_holder);
+        if (country == NULL) {
+            goto exit_cleanup;
+        }
+    }
+
+    options.language = language;
+    options.country = country;
+
+    parsed = parse_address(input, options);
+    if (parsed == NULL) {
+        // libpostal does not set a Python exception, so raise one here
+        PyErr_SetString(PyExc_RuntimeError,
+                        "Address could not be parsed");
+        goto exit_cleanup;
+    }
+
+    num_components = (Py_ssize_t)parsed->num_components;
+    result = PyList_New(num_components);
+    if (result == NULL) {
+        goto exit_cleanup;
+    }
+
+    for (i = 0; i < num_components; i++) {
+        char *component = parsed->components[i];
+        char *label = parsed->labels[i];
+        PyObject *component_unicode = NULL;
+        PyObject *label_unicode = NULL;
+        PyObject *tuple = NULL;
+
+        component_unicode = PyUnicode_DecodeUTF8((const char *)component, strlen(component), "strict");
+        if (component_unicode != NULL) {
+            label_unicode = PyUnicode_DecodeUTF8((const char *)label, strlen(label), "strict");
+        }
+        if (label_unicode != NULL) {
+            tuple = Py_BuildValue("(OO)", component_unicode, label_unicode);
+        }
+
+        // Py_BuildValue took its own references, so drop ours on every path
+        Py_XDECREF(component_unicode);
+        Py_XDECREF(label_unicode);
+
+        // Note: PyList_SetItem steals the reference to tuple, even on failure
+        if (tuple == NULL || PyList_SetItem(result, i, tuple) < 0) {
+            // Never hand back a partially filled (or already freed) list
+            Py_CLEAR(result);
+            goto exit_cleanup;
+        }
+    }
+
+exit_cleanup:
+    if (parsed != NULL) {
+        address_parser_response_destroy(parsed);
+    }
+    Py_XDECREF(country_holder);
+    Py_XDECREF(language_holder);
+    Py_XDECREF(input_holder);
+
+    return result;
+}
+
+static PyMethodDef parser_methods[] = {
+    {"parse_address", (PyCFunction)py_parse_address, METH_VARARGS | METH_KEYWORDS, "parse_address(address, language=None, country=None)"},
+    {NULL, NULL, 0, NULL},
+};
+
+
+
+#ifdef IS_PY3K
+
+static int parser_traverse(PyObject *m, visitproc visit, void *arg) {
+    Py_VISIT(GETSTATE(m)->error);
+    return 0;
+}
+
+static int parser_clear(PyObject *m) {
+    Py_CLEAR(GETSTATE(m)->error);
+    libpostal_teardown();
+    libpostal_teardown_parser();
+    return 0;
+}
+
+static struct PyModuleDef module_def = {
+    PyModuleDef_HEAD_INIT,
+    "_parser",
+    NULL,
+    sizeof(struct module_state),
+    parser_methods,
+    NULL,
+    parser_traverse,
+    parser_clear,
+    NULL
+};
+
+#define INITERROR return NULL
+
+// The export must be named PyInit_<module name>, i.e. PyInit__parser
+PyMODINIT_FUNC
+PyInit__parser(void) {
+#else
+
+#define INITERROR return
+
+void cleanup_libpostal(void) {
+    libpostal_teardown();
+    libpostal_teardown_parser();
+}
+
+PyMODINIT_FUNC
+init_parser(void) {
+#endif
+
+#ifdef IS_PY3K
+    PyObject *module = PyModule_Create(&module_def);
+#else
+    PyObject *module = Py_InitModule("_parser", parser_methods);
+#endif
+
+    if (module == NULL) {
+        INITERROR;
+    }
+    struct module_state *st = GETSTATE(module);
+
+    st->error = PyErr_NewException("_parser.Error", NULL, NULL);
+    if (st->error == NULL) {
+#ifdef IS_PY3K
+        // Py_InitModule (Python 2) returns a borrowed reference, so only
+        // the Python 3 module object is ours to release
+        Py_DECREF(module);
+#endif
+        INITERROR;
+    }
+
+    if (!libpostal_setup() || !libpostal_setup_parser()) {
+        // Fail the import instead of returning a module with an error set
+        Py_CLEAR(st->error);
+#ifdef IS_PY3K
+        Py_DECREF(module);
+#endif
+        PyErr_SetString(PyExc_TypeError,
+                        "Error loading libpostal data");
+        INITERROR;
+    }
+
+#ifndef IS_PY3K
+    Py_AtExit(&cleanup_libpostal);
+#endif
+
+
+#ifdef IS_PY3K
+    return module;
+#endif
+}