[rm] Removing Python bindings from this project, moving to https://github.com/openvenues/pypostal

This commit is contained in:
Al
2016-01-26 02:17:23 -05:00
parent 5077462754
commit cffc7e1034
15 changed files with 0 additions and 1507 deletions

View File

@@ -1,16 +0,0 @@
import _expand
from postal.text.encoding import safe_decode
DEFAULT_LANGUAGES = ('en',)


def expand_address(address, languages=DEFAULT_LANGUAGES, **kw):
    '''
    Decode the address to Unicode and hand it to the _expand C extension.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param languages: a tuple or list of ISO language code strings
                      (e.g. "en", "fr", "de", etc.) to use in expansion.
                      Default is English. Until automatic language
                      classification is ready in libpostal, this parameter
                      is required.
    @param kw: any further keyword options, forwarded unchanged to
               _expand.expand_address
    '''
    decoded_address = safe_decode(address, 'utf-8')
    # The extension does the actual work; this wrapper only normalizes the
    # input type and supplies the default language list.
    expansions = _expand.expand_address(decoded_address,
                                        languages=languages, **kw)
    return expansions

View File

@@ -1,14 +0,0 @@
import _parser
from postal.text.encoding import safe_decode
DEFAULT_LANGUAGES = ('en',)


def parse_address(address, language=None, country=None):
    '''
    Decode the address to Unicode and hand it to the _parser C extension.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    '''
    decoded_address = safe_decode(address, 'utf-8')
    parsed = _parser.parse_address(decoded_address,
                                   language=language,
                                   country=country)
    return parsed

View File

@@ -1,346 +0,0 @@
#include <Python.h>
#include <libpostal/libpostal.h>
// Python 3 detection. IS_PY3K is defined with no value, so it must be tested
// with #ifdef / #ifndef (a plain `#if IS_PY3K` has no expression to evaluate).
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
// Per-module state: holds the exception object the module init function
// creates with PyErr_NewException.
struct module_state {
PyObject *error;
};
// GETSTATE(m) yields the module_state for module m: the per-module storage
// returned by PyModule_GetState on Python 3, and a single file-level static
// on Python 2 (where the argument m is ignored).
#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
/* Free the first `count` strdup'd language codes, then the array itself. */
static void free_language_codes(char **languages, size_t count) {
    if (languages == NULL) {
        return;
    }
    for (size_t i = 0; i < count; i++) {
        free(languages[i]);
    }
    free(languages);
}

/*
 * expand_address(address, languages, **options) -> list of unicode strings
 *
 * Converts `address` to UTF-8, copies the language codes out of the
 * `languages` sequence, applies any option overrides on top of
 * LIBPOSTAL_DEFAULT_OPTIONS and calls libpostal's expand_address().
 *
 * Returns a new list of expansions, or NULL with a Python exception set.
 * TypeError is raised when the input cannot be converted, when a language
 * code is too long, or when no usable language code was supplied.
 */
static PyObject *py_expand(PyObject *self, PyObject *args, PyObject *keywords) {
    PyObject *arg_input = NULL;
    /* `languages` is optional in the format string, so it needs a defined
       value for the case where the caller omits it. */
    PyObject *arg_languages = Py_None;

    PyObject *unistr_input = NULL;
#ifndef IS_PY3K
    PyObject *str_input = NULL;
#endif
    PyObject *seq = NULL;
    PyObject *result = NULL;

    char *input = NULL;
    char **languages = NULL;
    size_t num_languages = 0;
    char **expansions = NULL;
    size_t num_expansions = 0;

    normalize_options_t options = LIBPOSTAL_DEFAULT_OPTIONS;

    static char *kwlist[] = {"address",
                             "languages",
                             "address_components",
                             "latin_ascii",
                             "transliterate",
                             "strip_accents",
                             "decompose",
                             "lowercase",
                             "trim_string",
                             "replace_word_hyphens",
                             "delete_word_hyphens",
                             "replace_numeric_hyphens",
                             "delete_numeric_hyphens",
                             "split_alpha_from_numeric",
                             "delete_final_periods",
                             "delete_acronym_periods",
                             "drop_english_possessives",
                             "delete_apostrophes",
                             "expand_numex",
                             "roman_numerals",
                             NULL
                            };

    /* Local types match the format units exactly: "H" stores an
       unsigned short, "I" stores an unsigned int. */
    unsigned short address_components = (unsigned short)options.address_components;
    unsigned int latin_ascii = options.latin_ascii;
    unsigned int transliterate = options.transliterate;
    unsigned int strip_accents = options.strip_accents;
    unsigned int decompose = options.decompose;
    unsigned int lowercase = options.lowercase;
    unsigned int trim_string = options.trim_string;
    unsigned int replace_word_hyphens = options.replace_word_hyphens;
    unsigned int delete_word_hyphens = options.delete_word_hyphens;
    unsigned int replace_numeric_hyphens = options.replace_numeric_hyphens;
    unsigned int delete_numeric_hyphens = options.delete_numeric_hyphens;
    unsigned int split_alpha_from_numeric = options.split_alpha_from_numeric;
    unsigned int delete_final_periods = options.delete_final_periods;
    unsigned int delete_acronym_periods = options.delete_acronym_periods;
    /* NOTE(review): assumes normalize_options_t has these two members, as
       the keyword list implies -- confirm against libpostal.h. */
    unsigned int drop_english_possessives = options.drop_english_possessives;
    unsigned int delete_apostrophes = options.delete_apostrophes;
    unsigned int expand_numex = options.expand_numex;
    unsigned int roman_numerals = options.roman_numerals;

    /* One format unit and one pointer per kwlist entry, in kwlist order:
       2 objects, 1 "H", then 17 "I" flags (6 + 5 + 4 + 2). */
    if (!PyArg_ParseTupleAndKeywords(args, keywords,
                                     "O|OH"
                                     "IIIIII"
                                     "IIIII"
                                     "IIII"
                                     "II"
                                     ":pyexpand", kwlist,
                                     &arg_input, &arg_languages,
                                     &address_components,
                                     &latin_ascii,
                                     &transliterate,
                                     &strip_accents,
                                     &decompose,
                                     &lowercase,
                                     &trim_string,
                                     &replace_word_hyphens,
                                     &delete_word_hyphens,
                                     &replace_numeric_hyphens,
                                     &delete_numeric_hyphens,
                                     &split_alpha_from_numeric,
                                     &delete_final_periods,
                                     &delete_acronym_periods,
                                     &drop_english_possessives,
                                     &delete_apostrophes,
                                     &expand_numex,
                                     &roman_numerals
                                     )) {
        return NULL;
    }

    options.address_components = address_components;
    options.latin_ascii = latin_ascii;
    options.transliterate = transliterate;
    options.strip_accents = strip_accents;
    options.decompose = decompose;
    options.lowercase = lowercase;
    options.trim_string = trim_string;
    options.replace_word_hyphens = replace_word_hyphens;
    options.delete_word_hyphens = delete_word_hyphens;
    options.replace_numeric_hyphens = replace_numeric_hyphens;
    options.delete_numeric_hyphens = delete_numeric_hyphens;
    options.split_alpha_from_numeric = split_alpha_from_numeric;
    options.delete_final_periods = delete_final_periods;
    options.delete_acronym_periods = delete_acronym_periods;
    options.drop_english_possessives = drop_english_possessives;
    options.delete_apostrophes = delete_apostrophes;
    options.expand_numex = expand_numex;
    options.roman_numerals = roman_numerals;

    unistr_input = PyUnicode_FromObject(arg_input);
    if (unistr_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be converted to unicode");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+. The buffer is owned by
    // unistr_input, which stays alive until cleanup.
    input = (char *)PyUnicode_AsUTF8(unistr_input);
#else
    // Python 2 encoding. The buffer is owned by str_input.
    str_input = PyUnicode_AsEncodedString(unistr_input, "utf-8", "strict");
    if (str_input == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Input could not be utf-8 encoded");
        goto cleanup;
    }
    input = PyBytes_AsString(str_input);
#endif

    if (input == NULL) {
        goto cleanup;
    }

    if (PySequence_Check(arg_languages)) {
        seq = PySequence_Fast(arg_languages, "Expected a sequence");
        if (seq == NULL) {
            goto cleanup;
        }

        Py_ssize_t len_languages = PySequence_Fast_GET_SIZE(seq);

        if (len_languages > 0) {
            languages = malloc((size_t)len_languages * sizeof(char *));
            if (languages == NULL) {
                PyErr_NoMemory();
                goto cleanup;
            }
        }

        for (Py_ssize_t i = 0; i < len_languages; i++) {
            /* Borrowed reference, valid while seq is alive. */
            PyObject *item = PySequence_Fast_GET_ITEM(seq, i);
            const char *language = NULL;

#ifdef IS_PY3K
            if (PyBytes_Check(item)) {
                language = PyBytes_AsString(item);
            } else if (PyUnicode_Check(item)) {
                /* Python 3 callers naturally pass str codes such as "en". */
                language = PyUnicode_AsUTF8(item);
            }
#else
            if (PyString_Check(item)) {
                language = PyString_AsString(item);
            }
#endif

            if (language == NULL) {
                if (PyErr_Occurred()) {
                    goto cleanup;
                }
                /* Items that are not strings (e.g. None) are skipped. */
                continue;
            }

            if (strlen(language) >= MAX_LANGUAGE_LEN) {
                PyErr_SetString(PyExc_TypeError, "language was longer than a language code");
                goto cleanup;
            }

            char *language_copy = strdup(language);
            if (language_copy == NULL) {
                PyErr_NoMemory();
                goto cleanup;
            }
            languages[num_languages++] = language_copy;
        }
    }

    if (num_languages == 0) {
        PyErr_SetString(PyExc_TypeError, "Must specify languages=[list of language codes] to expand_address");
        goto cleanup;
    }

    options.languages = languages;
    options.num_languages = (int)num_languages;

    expansions = expand_address(input, options, &num_expansions);
    if (expansions == NULL) {
        /* Returning NULL without an exception would surface as a
           SystemError in the interpreter. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                            "libpostal expand_address returned no result");
        }
        goto cleanup;
    }

    result = PyList_New((Py_ssize_t)num_expansions);
    if (result == NULL) {
        goto cleanup;
    }

    for (size_t i = 0; i < num_expansions; i++) {
        char *expansion = expansions[i];
        PyObject *u = PyUnicode_DecodeUTF8((const char *)expansion, (Py_ssize_t)strlen(expansion), "strict");
        if (u == NULL) {
            /* Py_CLEAR also resets result to NULL so the freed list is
               never handed back to the caller. */
            Py_CLEAR(result);
            goto cleanup;
        }
        // Note: PyList_SetItem steals a reference, so don't worry about DECREF
        PyList_SetItem(result, (Py_ssize_t)i, u);
    }

cleanup:
    if (expansions != NULL) {
        for (size_t i = 0; i < num_expansions; i++) {
            free(expansions[i]);
        }
        free(expansions);
    }
    free_language_codes(languages, num_languages);
    Py_XDECREF(seq);
#ifndef IS_PY3K
    Py_XDECREF(str_input);
#endif
    Py_XDECREF(unistr_input);

    return result;
}
// Method table for the _expand module. py_expand takes positional and keyword
// arguments, hence METH_VARARGS | METH_KEYWORDS and the PyCFunction cast.
static PyMethodDef expand_methods[] = {
{"expand_address", (PyCFunction)py_expand, METH_VARARGS | METH_KEYWORDS, "expand_address(text, **kw)"},
// Sentinel entry terminating the table.
{NULL, NULL},
};
#ifdef IS_PY3K
static int expand_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int expand_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
libpostal_teardown();
return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_expand",
NULL,
sizeof(struct module_state),
expand_methods,
NULL,
expand_traverse,
expand_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit_expand(void) {
#else
#define INITERROR return
void cleanup_libpostal(void) {
libpostal_teardown();
}
void
init_expand(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_expand", expand_methods);
#endif
if (module == NULL) {
INITERROR;
}
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_expand.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
if (!libpostal_setup()) {
PyErr_SetString(PyExc_TypeError,
"Error loading libpostal");
}
PyModule_AddIntConstant(module, "ADDRESS_ANY", ADDRESS_ANY);
PyModule_AddIntConstant(module, "ADDRESS_NAME", ADDRESS_NAME);
PyModule_AddIntConstant(module, "ADDRESS_HOUSE_NUMBER", ADDRESS_HOUSE_NUMBER);
PyModule_AddIntConstant(module, "ADDRESS_STREET", ADDRESS_STREET);
PyModule_AddIntConstant(module, "ADDRESS_UNIT", ADDRESS_UNIT);
PyModule_AddIntConstant(module, "ADDRESS_LOCALITY", ADDRESS_LOCALITY);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN1", ADDRESS_ADMIN1);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN2", ADDRESS_ADMIN2);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN3", ADDRESS_ADMIN3);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN4", ADDRESS_ADMIN4);
PyModule_AddIntConstant(module, "ADDRESS_ADMIN_OTHER", ADDRESS_ADMIN_OTHER);
PyModule_AddIntConstant(module, "ADDRESS_COUNTRY", ADDRESS_COUNTRY);
PyModule_AddIntConstant(module, "ADDRESS_NEIGHBORHOOD", ADDRESS_NEIGHBORHOOD);
PyModule_AddIntConstant(module, "ADDRESS_ALL", ADDRESS_ALL);
#ifndef IS_PY3K
Py_AtExit(&cleanup_libpostal);
#endif
#if IS_PY3K
return module;
#endif
}

View File

@@ -1,299 +0,0 @@
#include <Python.h>
#include <libpostal/libpostal.h>
// Python 3 detection. IS_PY3K is defined with no value, so it must be tested
// with #ifdef / #ifndef (a plain `#if IS_PY3K` has no expression to evaluate).
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
// Per-module state: holds the exception object the module init function
// creates with PyErr_NewException.
struct module_state {
PyObject *error;
};
// GETSTATE(m) yields the module_state for module m: the per-module storage
// returned by PyModule_GetState on Python 3, and a single file-level static
// on Python 2 (where the argument m is ignored).
#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
/*
 * Convert `obj` to a UTF-8 encoded C string.
 *
 * The objects that own the returned buffer are stored in *unistr and, on
 * Python 2, *encoded (set to NULL on Python 3). The caller must Py_XDECREF
 * both once it no longer needs the returned pointer, on success and on
 * failure alike. Returns NULL with a Python exception set on failure.
 */
static char *object_as_utf8(PyObject *obj, PyObject **unistr, PyObject **encoded,
                            const char *unicode_error, const char *encode_error) {
    *encoded = NULL;
    *unistr = PyUnicode_FromObject(obj);
    if (*unistr == NULL) {
        PyErr_SetString(PyExc_TypeError, unicode_error);
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    (void)encode_error;
    return (char *)PyUnicode_AsUTF8(*unistr);
#else
    // Python 2 encoding
    *encoded = PyUnicode_AsEncodedString(*unistr, "utf-8", "strict");
    if (*encoded == NULL) {
        PyErr_SetString(PyExc_TypeError, encode_error);
        return NULL;
    }
    return PyBytes_AsString(*encoded);
#endif
}

/*
 * parse_address(address, language=None, country=None)
 *     -> list of (component, label) unicode tuples
 *
 * Encodes the arguments as UTF-8, runs libpostal's parse_address() and
 * converts the response. Returns NULL with a Python exception set on error.
 */
static PyObject *py_parse_address(PyObject *self, PyObject *args, PyObject *keywords) {
    PyObject *arg_input = NULL;
    PyObject *arg_language = Py_None;
    PyObject *arg_country = Py_None;

    /* Owners of the C string buffers; all released in `cleanup`. */
    PyObject *unistr_input = NULL;
    PyObject *str_input = NULL;
    PyObject *unistr_language = NULL;
    PyObject *str_language = NULL;
    PyObject *unistr_country = NULL;
    PyObject *str_country = NULL;

    char *input = NULL;
    char *language = NULL;
    char *country = NULL;

    address_parser_options_t options = LIBPOSTAL_ADDRESS_PARSER_DEFAULT_OPTIONS;
    address_parser_response_t *parsed = NULL;
    PyObject *result = NULL;

    static char *kwlist[] = {"address",
                             "language",
                             "country",
                             NULL
                            };

    if (!PyArg_ParseTupleAndKeywords(args, keywords,
                                     "O|OO:pyparser", kwlist,
                                     &arg_input, &arg_language,
                                     &arg_country
                                     )) {
        return NULL;
    }

    input = object_as_utf8(arg_input, &unistr_input, &str_input,
                           "Input could not be converted to unicode",
                           "Input could not be utf-8 encoded");
    if (input == NULL) {
        goto cleanup;
    }

    if (arg_language != Py_None) {
        language = object_as_utf8(arg_language, &unistr_language, &str_language,
                                  "Language could not be converted to unicode",
                                  "Language could not be utf-8 encoded");
        if (language == NULL) {
            goto cleanup;
        }
    }

    if (arg_country != Py_None) {
        country = object_as_utf8(arg_country, &unistr_country, &str_country,
                                 "Country could not be converted to unicode",
                                 "Country could not be utf-8 encoded");
        if (country == NULL) {
            goto cleanup;
        }
    }

    /* NULL language/country are passed through when the caller gave None. */
    options.language = language;
    options.country = country;

    parsed = parse_address(input, options);
    if (parsed == NULL) {
        /* Returning NULL without an exception would surface as a
           SystemError in the interpreter. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                            "libpostal parse_address returned no result");
        }
        goto cleanup;
    }

    result = PyList_New((Py_ssize_t)parsed->num_components);
    if (result == NULL) {
        goto cleanup;
    }

    for (size_t i = 0; i < parsed->num_components; i++) {
        char *component = parsed->components[i];
        char *label = parsed->labels[i];
        PyObject *component_unicode = NULL;
        PyObject *label_unicode = NULL;
        PyObject *tuple = NULL;

        component_unicode = PyUnicode_DecodeUTF8((const char *)component, (Py_ssize_t)strlen(component), "strict");
        if (component_unicode != NULL) {
            label_unicode = PyUnicode_DecodeUTF8((const char *)label, (Py_ssize_t)strlen(label), "strict");
        }
        if (label_unicode != NULL) {
            /* "O" takes its own references, so ours are dropped below. */
            tuple = Py_BuildValue("(OO)", component_unicode, label_unicode);
        }
        Py_XDECREF(component_unicode);
        Py_XDECREF(label_unicode);

        if (tuple == NULL) {
            /* Py_CLEAR also resets result to NULL so neither a freed nor a
               partially filled list is returned. */
            Py_CLEAR(result);
            goto cleanup;
        }

        // Note: PyList_SetItem steals a reference, so don't worry about DECREF
        PyList_SetItem(result, (Py_ssize_t)i, tuple);
    }

cleanup:
    if (parsed != NULL) {
        address_parser_response_destroy(parsed);
    }
    Py_XDECREF(str_country);
    Py_XDECREF(unistr_country);
    Py_XDECREF(str_language);
    Py_XDECREF(unistr_language);
    Py_XDECREF(str_input);
    Py_XDECREF(unistr_input);

    return result;
}
// Method table for the _parser module. py_parse_address takes positional and
// keyword arguments, hence METH_VARARGS | METH_KEYWORDS and the PyCFunction cast.
static PyMethodDef parser_methods[] = {
{"parse_address", (PyCFunction)py_parse_address, METH_VARARGS | METH_KEYWORDS, "parse_address(text, language, country)"},
// Sentinel entry terminating the table.
{NULL, NULL},
};
#ifdef IS_PY3K
static int parser_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int parser_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
libpostal_teardown();
libpostal_teardown_parser();
return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_parser",
NULL,
sizeof(struct module_state),
parser_methods,
NULL,
parser_traverse,
parser_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit_parser(void) {
#else
#define INITERROR return
void cleanup_libpostal(void) {
libpostal_teardown();
libpostal_teardown_parser();
}
void
init_parser(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_parser", parser_methods);
#endif
if (module == NULL) {
INITERROR;
}
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_parser.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
if (!libpostal_setup() || !libpostal_setup_parser()) {
PyErr_SetString(PyExc_TypeError,
"Error loading libpostal data");
}
#ifndef IS_PY3K
Py_AtExit(&cleanup_libpostal);
#endif
#ifdef IS_PY3K
return module;
#endif
}

View File

@@ -1,34 +0,0 @@
import six
text_type = six.text_type
string_types = six.string_types
binary_type = six.binary_type
def safe_decode(value, encoding='utf-8', errors='strict'):
    '''
    Return value as text (unicode on Python 2, str on Python 3).

    @param value: text is returned unchanged, byte strings are decoded, and
                  any other object is converted to its string form
    @param encoding: codec used to decode byte strings
    @param errors: codec error handling scheme
    '''
    if isinstance(value, text_type):
        return value
    if isinstance(value, binary_type):
        return value.decode(encoding, errors)
    if six.PY2 or isinstance(value, (bytearray, memoryview)):
        # On Python 2 str(obj) is the byte-string form of any object; on
        # Python 3 only buffer-like objects are safe to pass to bytes().
        return binary_type(value).decode(encoding, errors)
    # Python 3: bytes(5) would produce five NUL bytes rather than "5", so
    # other objects go through their text representation instead.
    return text_type(value)
def safe_encode(value, incoming=None, encoding='utf-8', errors='strict'):
    '''
    Return value as a byte string in the requested encoding.

    @param value: text is encoded, byte strings are re-encoded when needed,
                  and any other object is converted to its string form
    @param incoming: encoding of value when it is a byte string; when not
                     given, the bytes are assumed to already be in encoding
    @param encoding: target encoding
    @param errors: codec error handling scheme
    '''
    if isinstance(value, text_type):
        return value.encode(encoding, errors)

    if not isinstance(value, binary_type):
        if six.PY2 or isinstance(value, (bytearray, memoryview)):
            return binary_type(value)
        # Python 3: bytes(5) would produce five NUL bytes rather than b"5",
        # so other objects are encoded from their text representation.
        return text_type(value).encode(encoding, errors)

    if hasattr(incoming, 'lower'):
        incoming = incoming.lower()
    if hasattr(encoding, 'lower'):
        encoding = encoding.lower()

    if value and encoding != incoming:
        # Decode with the source encoding when one was supplied. With no
        # incoming encoding this is a round trip through the target codec,
        # which still validates the bytes under errors='strict'.
        text = value.decode(incoming or encoding, errors)
        return text.encode(encoding, errors)
    return value

View File

@@ -1,84 +0,0 @@
# -*- coding: utf-8 -*-
from postal.text import _normalize
from postal.text.tokenize import tokenize_raw
from postal.text.token_types import token_types
from postal.text.encoding import safe_decode
# String options
# Flags re-exported from the _normalize extension module. They are combined
# with bitwise OR and passed as string_options to normalized_tokens.
NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII
NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE
NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS
NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE
NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE
NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM
NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS
# Default string options: every string flag above except
# NORMALIZE_STRING_TRANSLITERATE.
DEFAULT_STRING_OPTIONS = NORMALIZE_STRING_LATIN_ASCII | \
NORMALIZE_STRING_DECOMPOSE | \
NORMALIZE_STRING_TRIM | \
NORMALIZE_STRING_REPLACE_HYPHENS | \
NORMALIZE_STRING_STRIP_ACCENTS | \
NORMALIZE_STRING_LOWERCASE
# Token options
# Flags re-exported from the _normalize extension module. They are combined
# with bitwise OR and passed as token_options to normalized_tokens.
NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS
NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS
# Default token options: every token flag above except
# NORMALIZE_TOKEN_DELETE_HYPHENS and NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC.
DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \
NORMALIZE_TOKEN_REPLACE_DIGITS
def remove_parens(tokens):
    '''
    Drop parenthesized spans from a sequence of (token, token_type) pairs.

    Opening and closing punctuation tokens are never kept. Any other token
    is kept only while no opening token is pending. A closing token with no
    matching opener is simply discarded.
    '''
    kept = []
    depth = 0
    for token, token_class in tokens:
        if token_class == token_types.PUNCT_OPEN:
            depth += 1
            continue
        if token_class == token_types.PUNCT_CLOSE:
            # Never let the nesting depth go negative.
            if depth > 0:
                depth -= 1
            continue
        if depth <= 0:
            kept.append((token, token_class))
    return kept
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True):
    '''
    Normalize a string, tokenize it, then normalize every token.

    Only libpostal's deterministic normalizations are applied here, i.e.
    the ones producing a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Returns a list of (normalized token, token type) pairs. When
    strip_parentheticals is true the list is passed through remove_parens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    text = safe_decode(s)

    # The Latin-ASCII flag selects which string normalizer is used.
    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        normalize_string = _normalize.normalize_string_latin
    else:
        normalize_string = _normalize.normalize_string_utf8
    normalized = normalize_string(text, string_options)

    tokens = []
    # Raw tokens are tuples of (offset, len, type)
    for raw_token in tokenize_raw(normalized):
        token_text = _normalize.normalize_token(normalized, raw_token,
                                                token_options)
        tokens.append((token_text, token_types.from_id(raw_token[-1])))

    return remove_parens(tokens) if strip_parentheticals else tokens

View File

@@ -1,328 +0,0 @@
#include <Python.h>
#include "src/normalize.h"
#include "src/transliterate.h"
// Python 3 detection. IS_PY3K is defined with no value, so it must be tested
// with #ifdef / #ifndef (a plain `#if IS_PY3K` has no expression to evaluate).
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
// Per-module state: holds the exception object the module init function
// creates with PyErr_NewException.
struct module_state {
PyObject *error;
};
// GETSTATE(m) yields the module_state for module m: the per-module storage
// returned by PyModule_GetState on Python 3, and a single file-level static
// on Python 2 (where the argument m is ignored).
#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
/*
 * normalize_string_utf8(input, options) -> unicode
 *
 * Encodes `input` as UTF-8, runs normalize_string_utf8() with the given
 * option bits and decodes the result. Returns NULL with a Python exception
 * set on failure.
 */
static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    /* The "K" format unit stores an unsigned long long. */
    unsigned long long options;

    PyObject *unistr = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif
    PyObject *result = NULL;
    char *input = NULL;
    char *normalized = NULL;

    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) {
        return NULL;
    }

    unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto cleanup;
    }
    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto cleanup;
    }

    normalized = normalize_string_utf8(input, (uint64_t)options);
    if (normalized == NULL) {
        /* Returning NULL without an exception would surface as a
           SystemError in the interpreter. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_ValueError,
                            "String could not be normalized");
        }
        goto cleanup;
    }

    result = PyUnicode_DecodeUTF8((const char *)normalized, (Py_ssize_t)strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Result could not be utf-8 decoded");
    }

cleanup:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;
}
/*
 * normalize_string_latin(input, options) -> unicode
 *
 * Encodes `input` as UTF-8, runs normalize_string_latin() with the given
 * option bits and decodes the result. Returns NULL with a Python exception
 * set on failure.
 */
static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    /* The "K" format unit stores an unsigned long long. */
    unsigned long long options;

    PyObject *unistr = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif
    PyObject *result = NULL;
    char *input = NULL;
    char *normalized = NULL;

    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) {
        return NULL;
    }

    unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto cleanup;
    }
    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto cleanup;
    }

    normalized = normalize_string_latin(input, strlen(input), (uint64_t)options);
    if (normalized == NULL) {
        /* Guard before strlen(): a NULL result must not be dereferenced. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_ValueError,
                            "String could not be normalized");
        }
        goto cleanup;
    }

    result = PyUnicode_DecodeUTF8((const char *)normalized, (Py_ssize_t)strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Result could not be utf-8 decoded");
    }

cleanup:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;
}
/*
 * normalize_token(input, (offset, len, type), options) -> unicode
 *
 * Encodes `input` as UTF-8, builds a token_t from the (offset, len, type)
 * tuple and returns the text add_normalized_token() produces for it.
 * Returns NULL with a Python exception set on failure.
 */
static PyObject *py_normalize_token(PyObject *self, PyObject *args)
{
    PyObject *s;

    /* Types match the format units: "I" -> unsigned int,
       "H" -> unsigned short, "K" -> unsigned long long. */
    unsigned int offset;
    unsigned int len;
    unsigned short type;
    unsigned long long options;

    PyObject *unistr = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif
    PyObject *result = NULL;
    char *input = NULL;
    char_array *token_buffer = NULL;

    if (!PyArg_ParseTuple(args, "O(IIH)K:normalize", &s, &offset, &len, &type, &options)) {
        PyErr_SetString(PyExc_TypeError,
                        "Error parsing arguments");
        return NULL;
    }

    token_t token = (token_t){(size_t)offset, (size_t)len, type};

    unistr = PyUnicode_FromObject(s);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Parameter could not be utf-8 encoded");
        goto cleanup;
    }
    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto cleanup;
    }

    /* The token comes straight from the caller. Presumably offset and len
       index into the UTF-8 bytes of `input` (TODO confirm against
       add_normalized_token); reject spans that run past the buffer. */
    if (token.offset + token.len > strlen(input)) {
        PyErr_SetString(PyExc_ValueError,
                        "Token extends past the end of the input string");
        goto cleanup;
    }

    token_buffer = char_array_new_size(token.len);
    if (token_buffer == NULL) {
        PyErr_NoMemory();
        goto cleanup;
    }

    add_normalized_token(token_buffer, input, token, (uint64_t)options);
    char *token_str = char_array_get_string(token_buffer);

    /* n is read as counting the trailing NUL, as in the original `n - 1`;
       the guard only prevents unsigned wrap-around if n is 0. */
    size_t token_str_len = token_buffer->n > 0 ? token_buffer->n - 1 : 0;

    result = PyUnicode_DecodeUTF8((const char *)token_str, (Py_ssize_t)token_str_len, "strict");
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Error decoding token");
    }

cleanup:
    if (token_buffer != NULL) {
        char_array_destroy(token_buffer);
    }
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;
}
// Method table for the _normalize module. All three functions take
// positional arguments only (METH_VARARGS).
static PyMethodDef normalize_methods[] = {
{"normalize_string_utf8", (PyCFunction)py_normalize_string_utf8, METH_VARARGS, "normalize_string_utf8(input, options)"},
{"normalize_string_latin", (PyCFunction)py_normalize_string_latin, METH_VARARGS, "normalize_string_latin(input, options)"},
// NOTE(review): py_normalize_token parses (input, (offset, len, type), options),
// so the docstring below omits the token tuple argument.
{"normalize_token", (PyCFunction)py_normalize_token, METH_VARARGS, "normalize_token(input, options)"},
// Sentinel entry terminating the table.
{NULL, NULL},
};
#ifdef IS_PY3K
static int normalize_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int normalize_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_normalize",
NULL,
sizeof(struct module_state),
normalize_methods,
NULL,
normalize_traverse,
normalize_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit_normalize(void) {
#else
#define INITERROR return
void
init_normalize(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_normalize", normalize_methods);
#endif
if (module == NULL)
INITERROR;
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_normalize.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
if (!transliteration_module_setup(NULL)) {
PyErr_SetString(PyExc_RuntimeError,
"Could not load transliterate module");
Py_DECREF(module);
INITERROR;
}
PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LATIN_ASCII));
PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRANSLITERATE));
PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_STRIP_ACCENTS));
PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE));
PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE));
PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM));
PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_HYPHENS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_FINAL_PERIOD));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE));
PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC));
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_DIGITS));
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}

View File

@@ -1,164 +0,0 @@
#include <Python.h>
#include "src/scanner.h"
// Python 3 detection. IS_PY3K is defined with no value, so it must be tested
// with #ifdef / #ifndef (a plain `#if IS_PY3K` has no expression to evaluate).
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
// Per-module state: holds the exception object the module init function
// creates with PyErr_NewException.
struct module_state {
PyObject *error;
};
// GETSTATE(m) yields the module_state for module m: the per-module storage
// returned by PyModule_GetState on Python 3, and a single file-level static
// on Python 2 (where the argument m is ignored).
#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
/*
 * tokenize(text) -> tuple of (offset, len, type) tuples
 *
 * Encodes `text` as UTF-8, runs the scanner's tokenize() on it and returns
 * one 3-tuple of unsigned integers per token. Returns NULL with a Python
 * exception set on failure.
 */
static PyObject *py_tokenize(PyObject *self, PyObject *args)
{
    PyObject *arg1;

    PyObject *unistr = NULL;
#ifndef IS_PY3K
    PyObject *str = NULL;
#endif
    PyObject *result = NULL;
    char *input = NULL;
    token_array *tokens = NULL;

    if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) {
        return NULL;
    }

    unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return NULL;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    input = (char *)PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto cleanup;
    }
    input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto cleanup;
    }

    tokens = tokenize(input);
    if (tokens == NULL) {
        /* Returning NULL without an exception would surface as a
           SystemError in the interpreter. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_RuntimeError,
                            "Input could not be tokenized");
        }
        goto cleanup;
    }

    result = PyTuple_New((Py_ssize_t)tokens->n);
    if (result == NULL) {
        goto cleanup;
    }

    for (size_t i = 0; i < tokens->n; i++) {
        token_t token = tokens->a[i];
        /* "I" reads an unsigned int from the varargs, so each field is cast
           explicitly rather than passed at its declared width. */
        PyObject *tuple = Py_BuildValue("III",
                                        (unsigned int)token.offset,
                                        (unsigned int)token.len,
                                        (unsigned int)token.type);
        /* PyTuple_SetItem steals the reference to tuple, even on failure. */
        if (tuple == NULL || PyTuple_SetItem(result, (Py_ssize_t)i, tuple) < 0) {
            Py_CLEAR(result);
            goto cleanup;
        }
    }

cleanup:
    if (tokens != NULL) {
        token_array_destroy(tokens);
    }
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;
}
// Method table for the _tokenize module. py_tokenize takes positional
// arguments only (METH_VARARGS).
static PyMethodDef tokenize_methods[] = {
{"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"},
// Sentinel entry terminating the table.
{NULL, NULL},
};
#ifdef IS_PY3K
static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int tokenize_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_tokenize",
NULL,
sizeof(struct module_state),
tokenize_methods,
NULL,
tokenize_traverse,
tokenize_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit_tokenize(void) {
#else
#define INITERROR return
void
init_tokenize(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_tokenize", tokenize_methods);
#endif
if (module == NULL)
INITERROR;
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_tokenize.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}

View File

@@ -1,68 +0,0 @@
from postal.utils.enum import Enum, EnumValue
class token_types(Enum):
    '''
    Token type ids, one EnumValue per type, grouped by numeric range.

    WORD_TOKEN_TYPES and NUMERIC_TOKEN_TYPES are plain sets collecting the
    word-like and number-like members.
    '''

    # Word types (1-5)
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # Special tokens (20-23)
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # Numbers and numeric types (50-53)
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # Punctuation types, may separate a phrase (100-127)
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # Non-letters and whitespace (200, 300-301)
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

    # Every word-like token type
    WORD_TOKEN_TYPES = {
        WORD,
        ABBREVIATION,
        IDEOGRAPHIC_CHAR,
        HANGUL_SYLLABLE,
        ACRONYM,
    }

    # Every number-like token type
    NUMERIC_TOKEN_TYPES = {
        NUMERIC,
        ORDINAL,
        ROMAN_NUMERAL,
        IDEOGRAPHIC_NUMBER,
    }

View File

@@ -1,14 +0,0 @@
from postal.text.encoding import safe_encode, safe_decode
from postal.text import _tokenize
from postal.text.token_types import token_types
def tokenize_raw(s):
    '''
    Tokenize s and return the extension module's result unchanged.

    The input is decoded to text first; the _tokenize extension yields
    (offset, length, token type id) entries.
    '''
    text = safe_decode(s)
    return _tokenize.tokenize(text)
def tokenize(s):
    '''
    Tokenize s into a list of (token text, token type) pairs.

    The tokenizer is run on the decoded text, and each token's text is
    sliced out of the encoded byte string using the reported start and
    length before being decoded again.
    '''
    text = safe_decode(s)
    encoded = safe_encode(s)

    pairs = []
    for start, length, token_type in _tokenize.tokenize(text):
        token_bytes = encoded[start:start + length]
        pairs.append((safe_decode(token_bytes),
                      token_types.from_id(token_type)))
    return pairs

View File

@@ -1,62 +0,0 @@
class EnumValue(object):
    '''
    A named constant wrapping a raw value.

    Instances hash and compare by their value, both against other
    EnumValue objects and against raw values, on Python 2 and Python 3.
    '''

    def __init__(self, value, name=None):
        self.value = value
        self.name = name

    @staticmethod
    def _raw(other):
        # Comparisons accept either another EnumValue or a bare value.
        return other.value if isinstance(other, EnumValue) else other

    def __hash__(self):
        return hash(self.value)

    # Python 3 ignores __cmp__, so the rich comparison methods are spelled
    # out; without __eq__ equality would fall back to object identity.
    def __eq__(self, other):
        return self.value == self._raw(other)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        return self.value < self._raw(other)

    def __le__(self, other):
        return self.value <= self._raw(other)

    def __gt__(self, other):
        return self.value > self._raw(other)

    def __ge__(self, other):
        return self.value >= self._raw(other)

    def __cmp__(self, other):
        # Kept for code that calls the Python 2 protocol method directly.
        other_value = self._raw(other)
        return (self.value > other_value) - (self.value < other_value)

    def _label(self):
        # An unnamed value (name not assigned yet) still needs a printable
        # form; returning None from __str__/__repr__ raises TypeError.
        if self.name is not None:
            return self.name
        return 'EnumValue({!r})'.format(self.value)

    def __unicode__(self):
        return self._label()

    def __str__(self):
        return self._label()

    def __repr__(self):
        return self._label()
class EnumMeta(type):
    '''
    Metaclass that collects the EnumValue attributes of a class.

    Each class gets its own copy of the inherited registries, extended with
    the EnumValue attributes declared in its body: registry maps raw value
    to EnumValue, name_registry maps name to EnumValue. Iterating the class
    yields the registered values and indexing looks one up by raw value.
    '''

    def __init__(self, name, bases, dict_):
        # Copy so that a subclass never mutates its parent's registries.
        self.registry = self.registry.copy()
        self.name_registry = self.name_registry.copy()
        # items()/values() exist on both Python 2 and 3, unlike
        # iteritems()/itervalues().
        for attr_name, attr in dict_.items():
            # The registry is keyed by raw value, so test the raw value
            # rather than relying on EnumValue equality semantics.
            if isinstance(attr, EnumValue) and attr.value not in self.registry:
                if attr.name is None:
                    attr.name = attr_name
                self.registry[attr.value] = attr
                self.name_registry[attr.name] = attr
        super(EnumMeta, self).__init__(name, bases, dict_)

    def __iter__(self):
        return iter(self.registry.values())

    def __getitem__(self, key):
        return self.registry[key]
# Python 3 ignores the __metaclass__ class attribute, which would leave the
# registries of every Enum subclass empty. Inheriting from a throwaway base
# created by calling the metaclass applies EnumMeta on Python 2 and 3 alike.
class Enum(EnumMeta('_EnumBase', (object,),
                    {'registry': {}, 'name_registry': {}})):
    '''
    Base class for enumerations whose members are EnumValue attributes.

    Subclasses declare members as class attributes; EnumMeta registers them
    so they can be looked up by raw value or by name.
    '''

    registry = {}
    name_registry = {}

    @classmethod
    def from_id(cls, value):
        '''Return the member with the given raw value, or raise ValueError.'''
        try:
            return cls.registry[value]
        except KeyError:
            raise ValueError('Invalid value for {}: {}'.format(cls.__name__, value))

    @classmethod
    def from_string(cls, name):
        '''Return the member with the given name, or raise ValueError.'''
        try:
            return cls.name_registry[name]
        except KeyError:
            raise ValueError('Invalid name for {}: {}'.format(cls.__name__, name))

View File

@@ -1,78 +0,0 @@
import argparse
import os
import subprocess
import sys
from setuptools import setup, Extension, Command, find_packages
from setuptools.command.build_py import build_py
from setuptools.command.build_ext import build_ext
from setuptools.command.install import install
from distutils.errors import DistutilsArgError
SRC_DIR = 'src'
this_dir = os.path.realpath(os.path.dirname(__file__))
def main():
    '''Declare the four C extensions and run the setuptools build.'''

    def src_paths(*filenames):
        # Library sources live under SRC_DIR.
        return [os.path.join(SRC_DIR, filename) for filename in filenames]

    tokenize_ext = Extension(
        'postal.text._tokenize',
        sources=src_paths(
            'scanner.c',
            'string_utils.c',
            'tokens.c',
            'utf8proc/utf8proc.c',
        ) + ['python/postal/text/pytokenize.c'],
        include_dirs=[this_dir],
        extra_compile_args=['-O0', '-std=c99',
                            '-Wno-unused-function'],
    )

    normalize_ext = Extension(
        'postal.text._normalize',
        sources=src_paths(
            'normalize.c',
            'string_utils.c',
            'utf8proc/utf8proc.c',
            'tokens.c',
            'unicode_scripts.c',
            'transliterate.c',
            'file_utils.c',
            'trie.c',
            'trie_search.c',
        ) + ['python/postal/text/pynormalize.c'],
        include_dirs=[this_dir],
        extra_compile_args=['-std=c99', '-DHAVE_CONFIG_H',
                            '-Wno-unused-function'],
    )

    # The expand and parser bindings link against the installed libpostal
    # instead of compiling its sources.
    expand_ext = Extension(
        'postal._expand',
        sources=['python/postal/pyexpand.c'],
        libraries=['postal'],
        extra_compile_args=['-std=c99',
                            '-Wno-unused-function'],
    )

    parser_ext = Extension(
        'postal._parser',
        sources=['python/postal/pyparser.c'],
        libraries=['postal'],
        extra_compile_args=['-std=c99',
                            '-Wno-unused-function'],
    )

    setup(
        name='pypostal',
        version='0.2',
        install_requires=[
            'six',
        ],
        ext_modules=[
            tokenize_ext,
            normalize_ext,
            expand_ext,
            parser_ext,
        ],
        packages=find_packages('python'),
        package_dir={'': 'python'},
        include_package_data=True,
        zip_safe=False,
        url='http://mapzen.com',
        description='Fast address standardization and deduplication',
        license='MIT License',
        maintainer='mapzen.com',
        maintainer_email='pelias@mapzen.com'
    )
if __name__ == '__main__':
main()