diff --git a/python/postal/__init__.py b/python/postal/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/postal/expand.py b/python/postal/expand.py deleted file mode 100644 index 5586a65a..00000000 --- a/python/postal/expand.py +++ /dev/null @@ -1,16 +0,0 @@ -import _expand -from postal.text.encoding import safe_decode - -DEFAULT_LANGUAGES = ('en',) - - -def expand_address(address, languages=DEFAULT_LANGUAGES, **kw): - ''' - @param address: the address as either Unicode or a UTF-8 encoded string - @param languages: a tuple or list of ISO language code strings (e.g. "en", "fr", "de", etc.) - to use in expansion. Default is English. Until automatic language classification - is ready in libpostal, this parameter is required. - - ''' - address = safe_decode(address, 'utf-8') - return _expand.expand_address(address, languages=languages, **kw) diff --git a/python/postal/parser.py b/python/postal/parser.py deleted file mode 100644 index b6e96891..00000000 --- a/python/postal/parser.py +++ /dev/null @@ -1,14 +0,0 @@ -import _parser -from postal.text.encoding import safe_decode - -DEFAULT_LANGUAGES = ('en',) - - -def parse_address(address, language=None, country=None): - ''' - @param address: the address as either Unicode or a UTF-8 encoded string - @param language (optional): language code - @param country (optional): country code - ''' - address = safe_decode(address, 'utf-8') - return _parser.parse_address(address, language=language, country=country) diff --git a/python/postal/pyexpand.c b/python/postal/pyexpand.c deleted file mode 100644 index 3db8c207..00000000 --- a/python/postal/pyexpand.c +++ /dev/null @@ -1,346 +0,0 @@ -#include -#include - -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -struct module_state { - PyObject *error; -}; - - -#ifdef IS_PY3K - #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) -#else - #define GETSTATE(m) (&_state) - static struct module_state _state; -#endif - - -static PyObject *py_expand(PyObject *self, PyObject *args, PyObject *keywords) { - PyObject *arg_input; - PyObject *arg_languages; - normalize_options_t options = LIBPOSTAL_DEFAULT_OPTIONS; - - PyObject *result = NULL; - - static char *kwlist[] = {"address", - "languages", - "address_components", - "latin_ascii", - "transliterate", - "strip_accents", - "decompose", - "lowercase", - "trim_string", - "replace_word_hyphens", - "delete_word_hyphens", - "replace_numeric_hyphens", - "delete_numeric_hyphens", - "split_alpha_from_numeric", - "delete_final_periods", - "delete_acronym_periods", - "drop_english_possessives", - "delete_apostrophes", - "expand_numex", - "roman_numerals", - NULL - }; - - uint32_t address_components = options.address_components; - uint32_t latin_ascii = options.latin_ascii; - uint32_t transliterate = options.transliterate; - uint32_t strip_accents = options.strip_accents; - uint32_t decompose = options.decompose; - uint32_t lowercase = options.lowercase; - uint32_t trim_string = options.trim_string; - uint32_t replace_word_hyphens = options.replace_word_hyphens; - uint32_t delete_word_hyphens = options.delete_word_hyphens; - uint32_t replace_numeric_hyphens = options.replace_numeric_hyphens; - uint32_t delete_numeric_hyphens = options.delete_numeric_hyphens; - uint32_t split_alpha_from_numeric = options.split_alpha_from_numeric; - uint32_t delete_final_periods = options.delete_final_periods; - uint32_t delete_acronym_periods = options.delete_acronym_periods; - uint32_t expand_numex = options.expand_numex; - uint32_t roman_numerals = options.roman_numerals; - - if (!PyArg_ParseTupleAndKeywords(args, keywords, - "O|OHIIIIIIIIIIIIIIIIII:pyexpand", kwlist, - &arg_input, &arg_languages, - &address_components, - &latin_ascii, - &transliterate, - &strip_accents, - &decompose, - &lowercase, - &trim_string, - &replace_word_hyphens, - &delete_word_hyphens, - &replace_numeric_hyphens, - &delete_numeric_hyphens, - &split_alpha_from_numeric, - &delete_final_periods, - &delete_acronym_periods, - &expand_numex, - &roman_numerals - )) { - return 0; - } - - - options.address_components = address_components; - options.latin_ascii = latin_ascii; - options.transliterate = transliterate; - options.strip_accents = strip_accents; - options.decompose = decompose; - options.lowercase = lowercase; - options.trim_string = trim_string; - options.replace_word_hyphens = replace_word_hyphens; - options.delete_word_hyphens = delete_word_hyphens; - options.replace_numeric_hyphens = replace_numeric_hyphens; - options.delete_numeric_hyphens = delete_numeric_hyphens; - options.split_alpha_from_numeric = split_alpha_from_numeric; - options.delete_final_periods = delete_final_periods; - options.delete_acronym_periods = delete_acronym_periods; - options.expand_numex = expand_numex; - options.roman_numerals = roman_numerals; - - PyObject *unistr_input = PyUnicode_FromObject(arg_input); - if (unistr_input == NULL) { - PyErr_SetString(PyExc_TypeError, - "Input could not be converted to unicode"); - return 0; - } - - char *input = NULL; - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - input = PyUnicode_AsUTF8(unistr_input); - - #else - // Python 2 encoding - - PyObject *str_input = PyUnicode_AsEncodedString(unistr_input, "utf-8", "strict"); - if (str_input == NULL) { - PyErr_SetString(PyExc_TypeError, - "Input could not be utf-8 encoded"); - return 0; - } - - input = PyBytes_AsString(str_input); - #endif - - if (input == NULL) { - goto exit_decref_str; - } - - char **languages = NULL; - size_t num_languages = 0; - - if (PySequence_Check(arg_languages)) { - PyObject *seq = PySequence_Fast(arg_languages, "Expected a sequence"); - Py_ssize_t len_languages = PySequence_Length(arg_languages); - - if (len_languages > 0) { - languages = malloc(len_languages * sizeof(char *)); - if (languages == NULL) { - goto exit_decref_str; - } - - char *language = NULL; - - for (int i = 0; i < len_languages; i++) { - PyObject *item = PySequence_Fast_GET_ITEM(seq, i); - - language = NULL; - - #if IS_PY3K - - if (PyBytes_Check(item)) { - language = PyBytes_AsString(item); - } - - #else - - if (PyString_Check(item)) { - language = PyString_AsString(item); - } - - #endif - - if (language != NULL && item != Py_None) { - if (strlen(language) >= MAX_LANGUAGE_LEN) { - PyErr_SetString(PyExc_TypeError, "language was longer than a language code"); - free(languages); - Py_DECREF(seq); - goto exit_decref_str; - } - languages[num_languages] = strdup(language); - num_languages++; - } - - } - - if (num_languages > 0) { - options.languages = languages; - options.num_languages = (int)num_languages; - } else { - free(languages); - languages = NULL; - } - - } - - Py_DECREF(seq); - } - - if (languages == NULL) { - PyErr_SetString(PyExc_TypeError, "Must specify languages=[list of language codes] to expand_address"); - goto exit_decref_str; - } - - - size_t num_expansions = 0; - char **expansions = expand_address(input, options, &num_expansions); - - if (languages != NULL) { - for (int i = 0; i < num_languages; i++) { - free(languages[i]); - } - free(languages); - } - - if (expansions == NULL) { - goto exit_decref_str; - } - - result = PyList_New((Py_ssize_t)num_expansions); - if (!result) { - goto exit_free_expansions; - } - - for (int i = 0; i < num_expansions; i++) { - char *expansion = expansions[i]; - PyObject *u = PyUnicode_DecodeUTF8((const char *)expansion, strlen(expansion), "strict"); - if (u == NULL) { - Py_DECREF(result); - goto exit_free_expansions; - } - // Note: PyList_SetItem steals a reference, so don't worry about DECREF - PyList_SetItem(result, (Py_ssize_t)i, u); - } - -exit_free_expansions: - for (int i = 0; i < num_expansions; i++) { - free(expansions[i]); - } - free(expansions); -exit_decref_str: - #ifndef IS_PY3K - Py_XDECREF(str_input); - #endif -exit_decref_unistr: - Py_XDECREF(unistr_input); - - return result; -} - -static PyMethodDef expand_methods[] = { - {"expand_address", (PyCFunction)py_expand, METH_VARARGS | METH_KEYWORDS, "expand_address(text, **kw)"}, - {NULL, NULL}, -}; - - - -#ifdef IS_PY3K - -static int expand_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(GETSTATE(m)->error); - return 0; -} - -static int expand_clear(PyObject *m) { - Py_CLEAR(GETSTATE(m)->error); - libpostal_teardown(); - return 0; -} - -static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "_expand", - NULL, - sizeof(struct module_state), - expand_methods, - NULL, - expand_traverse, - expand_clear, - NULL -}; - -#define INITERROR return NULL - -PyObject * -PyInit_expand(void) { - -#else - -#define INITERROR return - -void cleanup_libpostal(void) { - libpostal_teardown(); -} - -void -init_expand(void) { - -#endif - -#ifdef IS_PY3K - PyObject *module = PyModule_Create(&module_def); -#else - PyObject *module = Py_InitModule("_expand", expand_methods); -#endif - - if (module == NULL) { - INITERROR; - } - struct module_state *st = GETSTATE(module); - - st->error = PyErr_NewException("_expand.Error", NULL, NULL); - if (st->error == NULL) { - Py_DECREF(module); - INITERROR; - } - - if (!libpostal_setup()) { - PyErr_SetString(PyExc_TypeError, - "Error loading libpostal"); - } - - PyModule_AddIntConstant(module, "ADDRESS_ANY", ADDRESS_ANY); - PyModule_AddIntConstant(module, "ADDRESS_NAME", ADDRESS_NAME); - PyModule_AddIntConstant(module, "ADDRESS_HOUSE_NUMBER", ADDRESS_HOUSE_NUMBER); - PyModule_AddIntConstant(module, "ADDRESS_STREET", ADDRESS_STREET); - PyModule_AddIntConstant(module, "ADDRESS_UNIT", ADDRESS_UNIT); - PyModule_AddIntConstant(module, "ADDRESS_LOCALITY", ADDRESS_LOCALITY); - PyModule_AddIntConstant(module, "ADDRESS_ADMIN1", ADDRESS_ADMIN1); - PyModule_AddIntConstant(module, "ADDRESS_ADMIN2", ADDRESS_ADMIN2); - PyModule_AddIntConstant(module, "ADDRESS_ADMIN3", ADDRESS_ADMIN3); - PyModule_AddIntConstant(module, "ADDRESS_ADMIN4", ADDRESS_ADMIN4); - PyModule_AddIntConstant(module, "ADDRESS_ADMIN_OTHER", ADDRESS_ADMIN_OTHER); - PyModule_AddIntConstant(module, "ADDRESS_COUNTRY", ADDRESS_COUNTRY); - PyModule_AddIntConstant(module, "ADDRESS_NEIGHBORHOOD", ADDRESS_NEIGHBORHOOD); - PyModule_AddIntConstant(module, "ADDRESS_ALL", ADDRESS_ALL); - -#ifndef IS_PY3K - Py_AtExit(&cleanup_libpostal); -#endif - -#if IS_PY3K - return module; -#endif -} - diff --git a/python/postal/pyparser.c b/python/postal/pyparser.c deleted file mode 100644 index 24c72220..00000000 --- a/python/postal/pyparser.c +++ /dev/null @@ -1,299 +0,0 @@ -#include -#include - -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -struct module_state { - PyObject *error; -}; - - -#ifdef IS_PY3K - #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) -#else - #define GETSTATE(m) (&_state) - static struct module_state _state; -#endif - - -static PyObject *py_parse_address(PyObject *self, PyObject *args, PyObject *keywords) { - PyObject *arg_input; - PyObject *arg_language = Py_None; - PyObject *arg_country = Py_None; - - PyObject *result = NULL; - - static char *kwlist[] = {"address", - "language", - "country", - NULL - }; - - - if (!PyArg_ParseTupleAndKeywords(args, keywords, - "O|OO:pyparser", kwlist, - &arg_input, &arg_language, - &arg_country - )) { - return 0; - } - - PyObject *unistr_input = PyUnicode_FromObject(arg_input); - if (unistr_input == NULL) { - PyErr_SetString(PyExc_TypeError, - "Input could not be converted to unicode"); - return 0; - } - - char *input = NULL; - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - input = PyUnicode_AsUTF8(unistr_input); - - #else - // Python 2 encoding - - PyObject *str_input = PyUnicode_AsEncodedString(unistr_input, "utf-8", "strict"); - if (str_input == NULL) { - PyErr_SetString(PyExc_TypeError, - "Input could not be utf-8 encoded"); - goto exit_decref_input_unistr; - } - - input = PyBytes_AsString(str_input); - #endif - - if (input == NULL) { - goto exit_decref_input_str; - } - - char *language = NULL; - - PyObject *unistr_language = Py_None; - PyObject *str_language = Py_None; - - if (arg_language != Py_None) { - unistr_language = PyUnicode_FromObject(arg_language); - if (unistr_language == NULL) { - PyErr_SetString(PyExc_TypeError, - "Language could not be converted to unicode"); - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - language = PyUnicode_AsUTF8(unistr_language); - - #else - // Python 2 encoding - - PyObject *str_language = PyUnicode_AsEncodedString(unistr_language, "utf-8", "strict"); - if (str_language == NULL) { - PyErr_SetString(PyExc_TypeError, - "Language could not be utf-8 encoded"); - goto exit_decref_language_unistr; - } - - language = PyBytes_AsString(str_language); - #endif - - if (language == NULL) { - goto exit_decref_language_str; - } - } - - char *country = NULL; - PyObject *unistr_country = Py_None; - PyObject *str_country = Py_None; - - if (arg_country != Py_None) { - unistr_country = PyUnicode_FromObject(arg_country); - if (unistr_country == NULL) { - PyErr_SetString(PyExc_TypeError, - "Country could not be converted to unicode"); - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - country = PyUnicode_AsUTF8(unistr_country); - - #else - // Python 2 encoding - - PyObject *str_country = PyUnicode_AsEncodedString(unistr_country, "utf-8", "strict"); - if (str_country == NULL) { - PyErr_SetString(PyExc_TypeError, - "Country could not be utf-8 encoded"); - goto exit_decref_country_unistr; - } - - country = PyBytes_AsString(str_country); - #endif - - if (country == NULL) { - goto exit_decref_country_str; - } - } - - address_parser_options_t options = LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS; - options.language = language; - options.country = country; - - address_parser_response_t *parsed = parse_address(input, options); - if (parsed == NULL) { - goto exit_decref_country_str; - } - - result = PyList_New((Py_ssize_t)parsed->num_components); - if (!result) { - goto exit_destroy_response; - } - - for (int i = 0; i < parsed->num_components; i++) { - char *component = parsed->components[i]; - char *label = parsed->labels[i]; - PyObject *component_unicode = PyUnicode_DecodeUTF8((const char *)component, strlen(component), "strict"); - if (component_unicode == NULL) { - Py_DECREF(result); - goto exit_destroy_response; - } - - PyObject *label_unicode = PyUnicode_DecodeUTF8((const char *)label, strlen(label), "strict"); - if (label_unicode == NULL) { - Py_DECREF(component_unicode); - Py_DECREF(result); - goto exit_destroy_response; - } - PyObject *tuple = Py_BuildValue("(OO)", component_unicode, label_unicode); - if (tuple == NULL) { - Py_DECREF(component_unicode); - Py_DECREF(label_unicode); - goto exit_destroy_response; - } - - // Note: PyList_SetItem steals a reference, so don't worry about DECREF - PyList_SetItem(result, (Py_ssize_t)i, tuple); - - Py_DECREF(component_unicode); - Py_DECREF(label_unicode); - } - -exit_destroy_response: - address_parser_response_destroy(parsed); -exit_decref_country_str: - #ifndef IS_PY3K - if (str_country != Py_None) { - Py_XDECREF(str_country); - } - #endif -exit_decref_country_unistr: - if (unistr_country != Py_None) { - Py_XDECREF(unistr_country); - } -exit_decref_language_str: - #ifndef IS_PY3K - if (str_language != Py_None) { - Py_XDECREF(str_language); - } - #endif -exit_decref_language_unistr: - if (unistr_language != Py_None) { - Py_XDECREF(unistr_language); - } -exit_decref_input_str: - #ifndef IS_PY3K - Py_XDECREF(str_input); - #endif -exit_decref_input_unistr: - Py_XDECREF(unistr_input); - - return result; -} - -static PyMethodDef parser_methods[] = { - {"parse_address", (PyCFunction)py_parse_address, METH_VARARGS | METH_KEYWORDS, "parse_address(text, language, country)"}, - {NULL, NULL}, -}; - - - -#ifdef IS_PY3K - -static int parser_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(GETSTATE(m)->error); - return 0; -} - -static int parser_clear(PyObject *m) { - Py_CLEAR(GETSTATE(m)->error); - libpostal_teardown(); - libpostal_teardown_parser(); - return 0; -} - -static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "_parser", - NULL, - sizeof(struct module_state), - parser_methods, - NULL, - parser_traverse, - parser_clear, - NULL -}; - -#define INITERROR return NULL - -PyObject * -PyInit_parser(void) { -#else - -#define INITERROR return - -void cleanup_libpostal(void) { - libpostal_teardown(); - libpostal_teardown_parser(); -} - -void -init_parser(void) { -#endif - -#ifdef IS_PY3K - PyObject *module = PyModule_Create(&module_def); -#else - PyObject *module = Py_InitModule("_parser", parser_methods); -#endif - - if (module == NULL) { - INITERROR; - } - struct module_state *st = GETSTATE(module); - - st->error = PyErr_NewException("_parser.Error", NULL, NULL); - if (st->error == NULL) { - Py_DECREF(module); - INITERROR; - } - - if (!libpostal_setup() || !libpostal_setup_parser()) { - PyErr_SetString(PyExc_TypeError, - "Error loading libpostal data"); - } - -#ifndef IS_PY3K - Py_AtExit(&cleanup_libpostal); -#endif - - -#ifdef IS_PY3K - return module; -#endif -} - diff --git a/python/postal/text/__init__.py b/python/postal/text/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/postal/text/encoding.py b/python/postal/text/encoding.py deleted file mode 100644 index b4bcbd61..00000000 --- a/python/postal/text/encoding.py +++ /dev/null @@ -1,34 +0,0 @@ -import six - -text_type = six.text_type -string_types = six.string_types -binary_type = six.binary_type - - -def safe_decode(value, encoding='utf-8', errors='strict'): - if isinstance(value, text_type): - return value - - if isinstance(value, (string_types, binary_type)): - return value.decode(encoding, errors) - else: - return binary_type(value).decode(encoding, errors) - - -def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'): - if not isinstance(value, (string_types, binary_type)): - return binary_type(value) - - if isinstance(value, text_type): - return value.encode(encoding, errors) - else: - if hasattr(incoming, 'lower'): - incoming = incoming.lower() - if hasattr(encoding, 'lower'): - encoding = encoding.lower() - - if value and encoding != incoming: - value = safe_decode(value, encoding, errors) - return value.encode(encoding, errors) - else: - return value diff --git a/python/postal/text/normalize.py b/python/postal/text/normalize.py deleted file mode 100644 index 9993a5eb..00000000 --- a/python/postal/text/normalize.py +++ /dev/null @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- -from postal.text import _normalize -from postal.text.tokenize import tokenize_raw -from postal.text.token_types import token_types - -from postal.text.encoding import safe_decode - -# String options -NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII -NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE -NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS -NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE -NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE -NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM -NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS - -DEFAULT_STRING_OPTIONS = NORMALIZE_STRING_LATIN_ASCII | \ - NORMALIZE_STRING_DECOMPOSE | \ - NORMALIZE_STRING_TRIM | \ - NORMALIZE_STRING_REPLACE_HYPHENS | \ - NORMALIZE_STRING_STRIP_ACCENTS | \ - NORMALIZE_STRING_LOWERCASE - -# Token options -NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS -NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS -NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD -NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS -NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES -NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE -NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC -NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS - -DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \ - NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ - NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \ - NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \ - NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \ - NORMALIZE_TOKEN_REPLACE_DIGITS - - -def remove_parens(tokens): - new_tokens = [] - open_parens = 0 - for t, c in tokens: - if c == token_types.PUNCT_OPEN: - open_parens += 1 - elif c == token_types.PUNCT_CLOSE: - if open_parens > 0: - open_parens -= 1 - elif open_parens <= 0: - new_tokens.append((t, c)) - return new_tokens - - -def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, - token_options=DEFAULT_TOKEN_OPTIONS, - strip_parentheticals=True): - ''' - Normalizes a string, tokenizes, and normalizes each token - with string and token-level options. - - This version only uses libpostal's deterministic normalizations - i.e. methods with a single output. The string tree version will - return multiple normalized strings, each with tokens. - - Usage: - normalized_tokens(u'St.-Barthélemy') - ''' - s = safe_decode(s) - if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII: - normalized = _normalize.normalize_string_latin(s, string_options) - else: - normalized = _normalize.normalize_string_utf8(s, string_options) - - # Tuples of (offset, len, type) - raw_tokens = tokenize_raw(normalized) - tokens = [(_normalize.normalize_token(normalized, t, token_options), - token_types.from_id(t[-1])) for t in raw_tokens] - - if strip_parentheticals: - return remove_parens(tokens) - else: - return tokens diff --git a/python/postal/text/pynormalize.c b/python/postal/text/pynormalize.c deleted file mode 100644 index 46b959f9..00000000 --- a/python/postal/text/pynormalize.c +++ /dev/null @@ -1,328 +0,0 @@ -#include - -#include "src/normalize.h" -#include "src/transliterate.h" - -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -struct module_state { - PyObject *error; -}; - - -#ifdef IS_PY3K - #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) -#else - #define GETSTATE(m) (&_state) - static struct module_state _state; -#endif - - - -static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) -{ - PyObject *arg1; - uint64_t options; - if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) { - return 0; - } - - PyObject *unistr = PyUnicode_FromObject(arg1); - if (unistr == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be converted to unicode in scanner"); - return 0; - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - char *input = PyUnicode_AsUTF8(unistr); - - #else - // Python 2 encoding - - PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); - if (str == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; - } - - char *input = PyBytes_AsString(str); - - #endif - - if (input == NULL) { - goto exit_decref_str; - } - - char *normalized = normalize_string_utf8(input, options); - - if (normalized == NULL) { - goto exit_decref_str; - } - - PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict"); - free(normalized); - if (result == NULL) { - PyErr_SetString(PyExc_ValueError, - "Result could not be utf-8 decoded"); - goto exit_decref_str; - } - - #ifndef IS_PY3K - Py_XDECREF(str); - #endif - Py_XDECREF(unistr); - - return result; - -exit_decref_str: -#ifndef IS_PY3K - Py_XDECREF(str); -#endif -exit_decref_unistr: - Py_XDECREF(unistr); - return 0; -} - - -static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) -{ - PyObject *arg1; - uint64_t options; - if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) { - return 0; - } - - PyObject *unistr = PyUnicode_FromObject(arg1); - if (unistr == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be converted to unicode in scanner"); - return 0; - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - char *input = PyUnicode_AsUTF8(unistr); - - #else - // Python 2 encoding - - PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); - if (str == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; - } - - char *input = PyBytes_AsString(str); - - #endif - - if (input == NULL) { - goto exit_decref_str; - } - - char *normalized = normalize_string_latin(input, strlen(input), options); - - PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict"); - free(normalized); - if (result == NULL) { - PyErr_SetString(PyExc_ValueError, - "Result could not be utf-8 decoded"); - goto exit_decref_str; - } - - #ifndef IS_PY3K - Py_XDECREF(str); - #endif - Py_XDECREF(unistr); - - return result; - -exit_decref_str: -#ifndef IS_PY3K - Py_XDECREF(str); -#endif -exit_decref_unistr: - Py_XDECREF(unistr); - return 0; -} - - - -static PyObject *py_normalize_token(PyObject *self, PyObject *args) -{ - PyObject *s; - - uint32_t offset; - uint32_t len; - uint16_t type; - - uint64_t options; - if (!PyArg_ParseTuple(args, "O(IIH)K:normalize", &s, &offset, &len, &type, &options)) { - PyErr_SetString(PyExc_TypeError, - "Error parsing arguments"); - return 0; - } - - token_t token = (token_t){(size_t)offset, (size_t)len, type}; - - PyObject *unistr = PyUnicode_FromObject(s); - if (unistr == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be converted to unicode in scanner"); - return 0; - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - char *input = PyUnicode_AsUTF8(unistr); - - #else - // Python 2 encoding - - PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); - if (str == NULL) { - PyErr_SetString(PyExc_ValueError, - "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; - } - - char *input = PyBytes_AsString(str); - - #endif - - if (input == NULL) { - goto exit_decref_str; - } - - char_array *token_buffer = char_array_new_size(token.len); - - add_normalized_token(token_buffer, input, token, options); - char *token_str = char_array_get_string(token_buffer); - PyObject *result = PyUnicode_DecodeUTF8((const char *)token_str, token_buffer->n - 1, "strict"); - - if (result == NULL) { - PyErr_SetString(PyExc_ValueError, - "Error decoding token"); - char_array_destroy(token_buffer); - goto exit_decref_str; - } - - char_array_destroy(token_buffer); - - #ifndef IS_PY3K - Py_XDECREF(str); - #endif - Py_XDECREF(unistr); - - return result; - -exit_decref_str: -#ifndef IS_PY3K - Py_XDECREF(str); -#endif -exit_decref_unistr: - Py_XDECREF(unistr); - return 0; -} - -static PyMethodDef normalize_methods[] = { - {"normalize_string_utf8", (PyCFunction)py_normalize_string_utf8, METH_VARARGS, "normalize_string_utf8(input, options)"}, - {"normalize_string_latin", (PyCFunction)py_normalize_string_latin, METH_VARARGS, "normalize_string_latin(input, options)"}, - {"normalize_token", (PyCFunction)py_normalize_token, METH_VARARGS, "normalize_token(input, options)"}, - {NULL, NULL}, -}; - - - -#ifdef IS_PY3K - -static int normalize_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(GETSTATE(m)->error); - return 0; -} - -static int normalize_clear(PyObject *m) { - Py_CLEAR(GETSTATE(m)->error); - return 0; -} - - -static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "_normalize", - NULL, - sizeof(struct module_state), - normalize_methods, - NULL, - normalize_traverse, - normalize_clear, - NULL -}; - -#define INITERROR return NULL - -PyObject * -PyInit_normalize(void) { -#else -#define INITERROR return - -void -init_normalize(void) { -#endif - -#ifdef IS_PY3K - PyObject *module = PyModule_Create(&module_def); -#else - PyObject *module = Py_InitModule("_normalize", normalize_methods); -#endif - - if (module == NULL) - INITERROR; - struct module_state *st = GETSTATE(module); - - st->error = PyErr_NewException("_normalize.Error", NULL, NULL); - if (st->error == NULL) { - Py_DECREF(module); - INITERROR; - } - - if (!transliteration_module_setup(NULL)) { - PyErr_SetString(PyExc_RuntimeError, - "Could not load transliterate module"); - Py_DECREF(module); - INITERROR; - } - - - PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LATIN_ASCII)); - PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRANSLITERATE)); - PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_STRIP_ACCENTS)); - PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE)); - PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE)); - PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM)); - PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS)); - - - PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_HYPHENS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_FINAL_PERIOD)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_DIGITS)); - - -#if PY_MAJOR_VERSION >= 3 - return module; -#endif -} \ No newline at end of file diff --git a/python/postal/text/pytokenize.c b/python/postal/text/pytokenize.c deleted file mode 100644 index 7986bae3..00000000 --- a/python/postal/text/pytokenize.c +++ /dev/null @@ -1,164 +0,0 @@ -#include - -#include "src/scanner.h" - -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -struct module_state { - PyObject *error; -}; - - -#ifdef IS_PY3K - #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) -#else - #define GETSTATE(m) (&_state) - static struct module_state _state; -#endif - - -static PyObject *py_tokenize(PyObject *self, PyObject *args) -{ - PyObject *arg1; - if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) { - return 0; - } - - PyObject *unistr = PyUnicode_FromObject(arg1); - if (unistr == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be converted to unicode in scanner"); - return 0; - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - char *input = PyUnicode_AsUTF8(unistr); - - #else - // Python 2 encoding - - PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); - if (str == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be utf-8 encoded"); - goto error_decref_unistr; - } - - char *input = PyBytes_AsString(str); - - #endif - - - if (input == NULL) { - goto error_decref_str; - } - - token_array *tokens = tokenize(input); - if (tokens == NULL) { - goto error_decref_str; - } - - PyObject *result = PyTuple_New(tokens->n); - if (!result) { - token_array_destroy(tokens); - goto error_decref_str; - return 0; - } - - PyObject *tuple; - - token_t token; - for (size_t i = 0; i < tokens->n; i++) { - token = tokens->a[i]; - tuple = Py_BuildValue("III", token.offset, token.len, token.type); - if (PyTuple_SetItem(result, i, tuple) < 0) { - token_array_destroy(tokens); - goto error_decref_str; - } - } - - #ifndef IS_PY3K - Py_XDECREF(str); - #endif - Py_XDECREF(unistr); - - token_array_destroy(tokens); - - return result; - -error_decref_str: -#ifndef IS_PY3K - Py_XDECREF(str); -#endif -error_decref_unistr: - Py_XDECREF(unistr); - return 0; -} - -static PyMethodDef tokenize_methods[] = { - {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"}, - {NULL, NULL}, -}; - - - -#ifdef IS_PY3K - -static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(GETSTATE(m)->error); - return 0; -} - -static int tokenize_clear(PyObject *m) { - Py_CLEAR(GETSTATE(m)->error); - return 0; -} - - -static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "_tokenize", - NULL, - sizeof(struct module_state), - tokenize_methods, - NULL, - tokenize_traverse, - tokenize_clear, - NULL -}; - -#define INITERROR return NULL - -PyObject * -PyInit_tokenize(void) { -#else -#define INITERROR return - -void -init_tokenize(void) { -#endif - -#ifdef IS_PY3K - PyObject *module = PyModule_Create(&module_def); -#else - PyObject *module = Py_InitModule("_tokenize", tokenize_methods); -#endif - - if (module == NULL) - INITERROR; - struct module_state *st = GETSTATE(module); - - st->error = PyErr_NewException("_tokenize.Error", NULL, NULL); - if (st->error == NULL) { - Py_DECREF(module); - INITERROR; - } - -#if PY_MAJOR_VERSION >= 3 - return module; -#endif -} \ No newline at end of file diff --git a/python/postal/text/token_types.py b/python/postal/text/token_types.py deleted file mode 100644 index 2fe433e5..00000000 --- a/python/postal/text/token_types.py +++ /dev/null @@ -1,68 +0,0 @@ -from postal.utils.enum import Enum, EnumValue - - -class token_types(Enum): - # Word types - WORD = EnumValue(1) - ABBREVIATION = EnumValue(2) - IDEOGRAPHIC_CHAR = EnumValue(3) - HANGUL_SYLLABLE = EnumValue(4) - ACRONYM = EnumValue(5) - - # Special tokens - EMAIL = EnumValue(20) - URL = EnumValue(21) - US_PHONE = EnumValue(22) - INTL_PHONE = EnumValue(23) - - # Numbers and numeric types - NUMERIC = EnumValue(50) - ORDINAL = EnumValue(51) - ROMAN_NUMERAL = EnumValue(52) - IDEOGRAPHIC_NUMBER = EnumValue(53) - - # Punctuation types, may separate a phrase - PERIOD = EnumValue(100) - EXCLAMATION = EnumValue(101) - QUESTION_MARK = EnumValue(102) - COMMA = EnumValue(103) - COLON = EnumValue(104) - SEMICOLON = EnumValue(105) - PLUS = EnumValue(106) - AMPERSAND = EnumValue(107) - AT_SIGN = EnumValue(108) - POUND = EnumValue(109) - ELLIPSIS = EnumValue(110) - DASH = EnumValue(111) - BREAKING_DASH = EnumValue(112) - HYPHEN = EnumValue(113) - PUNCT_OPEN = EnumValue(114) - PUNCT_CLOSE = EnumValue(115) - DOUBLE_QUOTE = EnumValue(119) - SINGLE_QUOTE = EnumValue(120) - OPEN_QUOTE = EnumValue(121) - CLOSE_QUOTE = EnumValue(122) - SLASH = EnumValue(124) - BACKSLASH = EnumValue(125) - GREATER_THAN = EnumValue(126) - LESS_THAN = EnumValue(127) - - # Non-letters and whitespace - OTHER = EnumValue(200) - WHITESPACE = EnumValue(300) - NEWLINE = EnumValue(301) - - WORD_TOKEN_TYPES = set([ - WORD, - ABBREVIATION, - IDEOGRAPHIC_CHAR, - HANGUL_SYLLABLE, - ACRONYM - ]) - - NUMERIC_TOKEN_TYPES = set([ - NUMERIC, - ORDINAL, - ROMAN_NUMERAL, - IDEOGRAPHIC_NUMBER, - ]) diff --git a/python/postal/text/tokenize.py b/python/postal/text/tokenize.py deleted file mode 100644 index e2ed77b5..00000000 --- a/python/postal/text/tokenize.py +++ /dev/null @@ -1,14 +0,0 @@ -from postal.text.encoding import safe_encode, safe_decode -from postal.text import _tokenize -from postal.text.token_types import token_types - - -def tokenize_raw(s): - return _tokenize.tokenize(safe_decode(s)) - - -def tokenize(s): - u = safe_decode(s) - s = safe_encode(s) - return [(safe_decode(s[start:start + length]), token_types.from_id(token_type)) - for start, length, token_type in _tokenize.tokenize(u)] diff --git a/python/postal/utils/__init__.py b/python/postal/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/postal/utils/enum.py b/python/postal/utils/enum.py deleted file mode 100644 index 504ad70a..00000000 --- a/python/postal/utils/enum.py +++ /dev/null @@ -1,62 +0,0 @@ - -class EnumValue(object): - def __init__(self, value, name=None): - self.value = value - self.name = name - - def __hash__(self): - return self.value - - def __cmp__(self, other): - if isinstance(other, EnumValue): - return self.value.__cmp__(other.value) - else: - return self.value.__cmp__(other) - - def __unicode__(self): - return self.name - - def __str__(self): - return self.name - - def __repr__(self): - return self.name - - -class EnumMeta(type): - def __init__(self, name, bases, dict_): - self.registry = self.registry.copy() - self.name_registry = self.name_registry.copy() - for k, v in dict_.iteritems(): - if isinstance(v, EnumValue) and v not in self.registry: - if v.name is None: - v.name = k - self.registry[v.value] = v - self.name_registry[v.name] = v - return super(EnumMeta, self).__init__(name, bases, dict_) - - def __iter__(self): - return self.registry.itervalues() - - def __getitem__(self, key): - return self.registry[key] - - -class Enum(object): - __metaclass__ = EnumMeta - registry = {} - name_registry = {} - - @classmethod - def from_id(cls, value): - try: - return cls.registry[value] - except KeyError: - raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value)) - - @classmethod - def from_string(cls, name): - try: - return cls.name_registry[name] - except KeyError: - raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name)) diff --git a/setup.py b/setup.py deleted file mode 100644 index 5b8f8543..00000000 --- a/setup.py +++ /dev/null @@ -1,78 +0,0 @@ -import argparse -import os -import subprocess -import sys - -from setuptools import setup, Extension, Command, find_packages -from setuptools.command.build_py import build_py -from setuptools.command.build_ext import build_ext -from setuptools.command.install import install -from distutils.errors import DistutilsArgError - -SRC_DIR = 'src' -this_dir = os.path.realpath(os.path.dirname(__file__)) - - -def main(): - setup( - name='pypostal', - version='0.2', - install_requires=[ - 'six', - ], - ext_modules=[ - Extension('postal.text._tokenize', - sources=[os.path.join(SRC_DIR, f) - for f in ('scanner.c', - 'string_utils.c', - 'tokens.c', - 'utf8proc/utf8proc.c', - ) - ] + ['python/postal/text/pytokenize.c'], - include_dirs=[this_dir], - extra_compile_args=['-O0', '-std=c99', - '-Wno-unused-function'], - ), - Extension('postal.text._normalize', - sources=[os.path.join(SRC_DIR, f) - for f in ('normalize.c', - 'string_utils.c', - 'utf8proc/utf8proc.c', - 'tokens.c', - 'unicode_scripts.c', - 'transliterate.c', - 'file_utils.c', - 'trie.c', - 'trie_search.c',) - ] + ['python/postal/text/pynormalize.c'], - include_dirs=[this_dir], - extra_compile_args=['-std=c99', '-DHAVE_CONFIG_H', - '-Wno-unused-function'], - ), - Extension('postal._expand', - sources=['python/postal/pyexpand.c'], - libraries=['postal'], - extra_compile_args=['-std=c99', - '-Wno-unused-function'], - ), - Extension('postal._parser', - sources=['python/postal/pyparser.c'], - libraries=['postal'], - extra_compile_args=['-std=c99', - '-Wno-unused-function'], - ), - ], - packages=find_packages('python'), - package_dir={'': 'python'}, - include_package_data=True, - zip_safe=False, - url='http://mapzen.com', - description='Fast address standardization and deduplication', - license='MIT License', - maintainer='mapzen.com', - maintainer_email='pelias@mapzen.com' - ) - - -if __name__ == '__main__': - main()