Initial fork commit
0  scripts/geodata/text/__init__.py  Normal file
76  scripts/geodata/text/normalize.py  Normal file
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
import six

from geodata.text import _normalize
from geodata.text.token_types import token_types

from geodata.encoding import safe_decode

# String options
NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII
NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE
NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS
NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE
NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE
NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM
NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS
NORMALIZE_STRING_SIMPLE_LATIN_ASCII = _normalize.NORMALIZE_STRING_SIMPLE_LATIN_ASCII

DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS

# Token options
NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS
NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS

DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS

TOKEN_OPTIONS_DROP_PERIODS = _normalize.NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS
DEFAULT_TOKEN_OPTIONS_NUMERIC = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC


def remove_parens(tokens):
    new_tokens = []
    open_parens = 0
    for t, c in tokens:
        if c == token_types.PUNCT_OPEN:
            open_parens += 1
        elif c == token_types.PUNCT_CLOSE:
            if open_parens > 0:
                open_parens -= 1
        elif open_parens <= 0:
            new_tokens.append((t, c))
    return new_tokens


def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
    s = safe_decode(s)
    return _normalize.normalize_string(s, string_options)


def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True, whitespace=False):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)
    normalized_tokens = _normalize.normalized_tokens(s, string_options, token_options, whitespace)

    if strip_parentheticals:
        normalized_tokens = remove_parens(normalized_tokens)

    return [(s, token_types.from_id(token_type)) for s, token_type in normalized_tokens]
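A minimal usage sketch for the wrapper above, assuming the `_normalize` C extension has been built and the geodata package is importable. The exact tokens returned depend on the options and on the libpostal data files installed, so the outputs shown in comments are illustrative only.

    from geodata.text.normalize import (normalized_tokens, normalize_string,
                                        NORMALIZE_STRING_LOWERCASE, NORMALIZE_STRING_LATIN_ASCII)

    # Full pipeline: string normalization, tokenization, per-token normalization.
    # Expect something like [(u'st', WORD), (u'barthelemy', WORD)] with the default options.
    print(normalized_tokens(u'St.-Barthélemy'))

    # String-level normalization only, with explicit option flags ORed together.
    print(normalize_string(u'Quatre-Vingt-Douze',
                           NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII))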
75  scripts/geodata/text/phrases.py  Normal file
@@ -0,0 +1,75 @@
import six

from collections import *
from marisa_trie import BytesTrie
from geodata.encoding import safe_encode, safe_decode

SENTINEL = None


class PhraseFilter(object):
    def __init__(self, phrases):
        if hasattr(phrases, 'items'):
            phrases = six.iteritems(phrases)
        vals = [(safe_decode(key), self.serialize(val)) for key, val in phrases]
        self.trie = BytesTrie(vals)

    serialize = staticmethod(safe_encode)
    deserialize = staticmethod(safe_decode)

    def filter(self, tokens):
        def return_item(item):
            return False, item, []

        if not tokens:
            return

        ent = []
        ent_tokens = []

        queue = deque(tokens + [(SENTINEL,) * 2])
        skip_until = 0

        trie = self.trie

        while queue:
            item = queue.popleft()
            t, c = item

            if t is not SENTINEL and trie.has_keys_with_prefix(u' '.join(ent_tokens + [t])):
                ent.append(item)
                ent_tokens.append(item[0])
            elif ent_tokens:
                res = trie.get(u' '.join(ent_tokens)) or None
                if res is not None:
                    yield (True, ent, map(self.deserialize, res))
                    queue.appendleft(item)
                    ent = []
                    ent_tokens = []
                elif len(ent_tokens) == 1:
                    yield return_item(ent[0])
                    ent = []
                    ent_tokens = []
                    queue.appendleft(item)
                else:
                    have_phrase = False

                    for i in xrange(len(ent) - 1, 0, -1):
                        remainder = ent[i:]
                        res = trie.get(u' '.join([e[0] for e in ent[:i]])) or None
                        if res is not None:
                            yield (True, ent[:i], map(self.deserialize, res))
                            have_phrase = True
                            break

                    if not have_phrase:
                        yield return_item(ent[0])

                    todos = list(remainder)
                    todos.append(item)
                    queue.extendleft(reversed(todos))

                    ent = []
                    ent_tokens = []
            elif t is not SENTINEL:
                yield return_item(item)
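A sketch of how PhraseFilter might be driven, assuming marisa_trie is installed. Trie keys are space-joined token strings, and filter() yields (is_phrase, token(s), data) triples: a matched phrase comes back as a list of (token, class) pairs plus its deserialized values, and everything else as a single pair. The sample phrase dictionary and output are illustrative, not part of this commit.

    from geodata.text.phrases import PhraseFilter
    from geodata.text.tokenize import tokenize

    # Values are serialized with safe_encode by default; any string payload works.
    phrases = {u'new york': u'city', u'new york city': u'city'}
    phrase_filter = PhraseFilter(phrases)

    tokens = tokenize(u'New York City Hall')
    # Trie keys are case-sensitive, so tokens would normally be normalized/lowercased first.
    for is_phrase, item, data in phrase_filter.filter([(t.lower(), c) for t, c in tokens]):
        print(is_phrase, item, list(data))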
291  scripts/geodata/text/pynormalize.c  Normal file
@@ -0,0 +1,291 @@
#include <Python.h>

#include <libpostal/libpostal.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

struct module_state {
    PyObject *error;
};


#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif

static PyObject *py_normalize_string(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    uint64_t options;
    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) {
        return 0;
    }

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    char *input = PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto exit_normalize_decref_unistr;
    }

    char *input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto exit_normalize_decref_str;
    }

    char *normalized = libpostal_normalize_string(input, options);

    if (normalized == NULL) {
        goto exit_normalize_decref_str;
    }

    PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Result could not be utf-8 decoded");
        goto exit_normalize_decref_str;
    }

#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;

exit_normalize_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_normalize_decref_unistr:
    Py_XDECREF(unistr);
    return 0;
}


static PyObject *py_normalized_tokens(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    uint64_t string_options = LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS;
    uint64_t token_options = LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS;
    uint32_t arg_whitespace = 0;

    PyObject *result = NULL;

    if (!PyArg_ParseTuple(args, "O|KKI:normalize", &arg1, &string_options, &token_options, &arg_whitespace)) {
        return 0;
    }

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    char *input = PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto exit_normalized_tokens_decref_str;
    }

    char *input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto exit_normalized_tokens_decref_str;
    }

    bool whitespace = arg_whitespace;

    size_t num_tokens;
    libpostal_normalized_token_t *normalized_tokens = libpostal_normalized_tokens(input, string_options, token_options, whitespace, &num_tokens);

    if (normalized_tokens == NULL) {
        goto exit_normalized_tokens_decref_str;
    }

    result = PyList_New((Py_ssize_t)num_tokens);
    if (!result) {
        goto exit_free_normalized_tokens;
    }

    for (size_t i = 0; i < num_tokens; i++) {
        libpostal_normalized_token_t normalized_token = normalized_tokens[i];
        char *token_str = normalized_token.str;
        PyObject *py_token = PyUnicode_DecodeUTF8((const char *)token_str, strlen(token_str), "strict");
        if (py_token == NULL) {
            Py_DECREF(result);
            goto exit_free_normalized_tokens;
        }

        PyObject *t = PyTuple_New(2);
        // PyInt_* only exists on Python 2; use PyLong_* for the Python 3 build
#ifdef IS_PY3K
        PyObject *py_token_type = PyLong_FromLong(normalized_token.token.type);
#else
        PyObject *py_token_type = PyInt_FromLong(normalized_token.token.type);
#endif

        PyTuple_SetItem(t, 0, py_token);
        PyTuple_SetItem(t, 1, py_token_type);

        // Note: PyList_SetItem steals a reference, so don't worry about DECREF
        PyList_SetItem(result, (Py_ssize_t)i, t);
    }

    for (size_t i = 0; i < num_tokens; i++) {
        free(normalized_tokens[i].str);
    }
    free(normalized_tokens);

#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;
exit_free_normalized_tokens:
    for (size_t i = 0; i < num_tokens; i++) {
        free(normalized_tokens[i].str);
    }
    free(normalized_tokens);
exit_normalized_tokens_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_normalized_tokens_decref_unistr:
    Py_XDECREF(unistr);
    return 0;
}


static PyMethodDef normalize_methods[] = {
    {"normalize_string", (PyCFunction)py_normalize_string, METH_VARARGS, "normalize_string(input, options)"},
    {"normalized_tokens", (PyCFunction)py_normalized_tokens, METH_VARARGS, "normalized_tokens(input, string_options, token_options, whitespace)"},
    {NULL, NULL},
};


#ifdef IS_PY3K

static int normalize_traverse(PyObject *m, visitproc visit, void *arg) {
    Py_VISIT(GETSTATE(m)->error);
    return 0;
}

static int normalize_clear(PyObject *m) {
    Py_CLEAR(GETSTATE(m)->error);
    return 0;
}


static struct PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_normalize",
    NULL,
    sizeof(struct module_state),
    normalize_methods,
    NULL,
    normalize_traverse,
    normalize_clear,
    NULL
};

#define INITERROR return NULL

// The module is named "_normalize", so the Python 3 entry point must be PyInit__normalize
PyObject *
PyInit__normalize(void) {
#else
#define INITERROR return

void
init_normalize(void) {
#endif

#ifdef IS_PY3K
    PyObject *module = PyModule_Create(&module_def);
#else
    PyObject *module = Py_InitModule("_normalize", normalize_methods);
#endif

    if (module == NULL)
        INITERROR;
    struct module_state *st = GETSTATE(module);

    st->error = PyErr_NewException("_normalize.Error", NULL, NULL);
    if (st->error == NULL) {
        Py_DECREF(module);
        INITERROR;
    }

    if (!libpostal_setup()) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not load libpostal");
        Py_DECREF(module);
        INITERROR;
    }

    PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII));
    PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE));
    PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS));
    PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE));
    PyModule_AddObject(module, "NORMALIZE_STRING_COMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_COMPOSE));
    PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LOWERCASE));
    PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRIM));
    PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS));
    PyModule_AddObject(module, "NORMALIZE_STRING_SIMPLE_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII));


    PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS));


    PyModule_AddObject(module, "NORMALIZE_DEFAULT_STRING_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS));
    PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS));

    PyModule_AddObject(module, "NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS));

    PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC));


#if PY_MAJOR_VERSION >= 3
    return module;
#endif
}
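The extension above exposes normalize_string(input, options) and normalized_tokens(input, string_options, token_options, whitespace), where the options are the uint64 bit flags registered at module init and normalized_tokens returns (token, integer type id) pairs. A small sketch of calling it directly, assuming the extension has been compiled against libpostal; the normalize.py wrapper earlier in this commit is the usual entry point, and the printed values are illustrative.

    from geodata.text import _normalize

    options = (_normalize.NORMALIZE_STRING_LOWERCASE |
               _normalize.NORMALIZE_STRING_LATIN_ASCII |
               _normalize.NORMALIZE_STRING_TRIM)

    print(_normalize.normalize_string(u'São Paulo ', options))

    # Raw token types come back as integer ids; token_types.from_id maps them to enum values.
    print(_normalize.normalized_tokens(u'São Paulo',
                                       _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS,
                                       _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS,
                                       False))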
167  scripts/geodata/text/pytokenize.c  Normal file
@@ -0,0 +1,167 @@
#include <Python.h>

#include <libpostal/libpostal.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

struct module_state {
    PyObject *error;
};


#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif

static PyObject *py_tokenize(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    uint32_t arg_whitespace = 0;

    if (!PyArg_ParseTuple(args, "OI:tokenize", &arg1, &arg_whitespace)) {
        return 0;
    }

    bool whitespace = arg_whitespace;

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    char *input = PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto error_decref_unistr;
    }

    char *input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto error_decref_str;
    }

    size_t num_tokens;

    libpostal_token_t *tokens = libpostal_tokenize(input, whitespace, &num_tokens);
    if (tokens == NULL) {
        goto error_decref_str;
    }

    PyObject *result = PyTuple_New(num_tokens);
    if (!result) {
        free(tokens);
        goto error_decref_str;
    }

    PyObject *tuple;

    libpostal_token_t token;
    for (size_t i = 0; i < num_tokens; i++) {
        token = tokens[i];
        tuple = Py_BuildValue("III", token.offset, token.len, token.type);
        if (PyTuple_SetItem(result, i, tuple) < 0) {
            free(tokens);
            goto error_decref_str;
        }
    }

#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    free(tokens);

    return result;

error_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
error_decref_unistr:
    Py_XDECREF(unistr);
    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text, whitespace)"},
    {NULL, NULL},
};

#ifdef IS_PY3K

static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) {
    Py_VISIT(GETSTATE(m)->error);
    return 0;
}

static int tokenize_clear(PyObject *m) {
    Py_CLEAR(GETSTATE(m)->error);
    return 0;
}


static struct PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenize",
    NULL,
    sizeof(struct module_state),
    tokenize_methods,
    NULL,
    tokenize_traverse,
    tokenize_clear,
    NULL
};

#define INITERROR return NULL

// The module is named "_tokenize", so the Python 3 entry point must be PyInit__tokenize
PyObject *
PyInit__tokenize(void) {
#else
#define INITERROR return

void
init_tokenize(void) {
#endif

#ifdef IS_PY3K
    PyObject *module = PyModule_Create(&module_def);
#else
    PyObject *module = Py_InitModule("_tokenize", tokenize_methods);
#endif

    if (module == NULL)
        INITERROR;
    struct module_state *st = GETSTATE(module);

    st->error = PyErr_NewException("_tokenize.Error", NULL, NULL);
    if (st->error == NULL) {
        Py_DECREF(module);
        INITERROR;
    }

#if PY_MAJOR_VERSION >= 3
    return module;
#endif
}
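Both C extensions link against an installed libpostal. The build configuration is not part of this commit; the fragment below is a hypothetical setup.py sketch of how the modules might be compiled, with the module names and source paths taken from the files above and the libpostal include/library locations assumed.

    from setuptools import setup, Extension

    ext_modules = [
        Extension('geodata.text._normalize',
                  sources=['scripts/geodata/text/pynormalize.c'],
                  libraries=['postal'],
                  include_dirs=['/usr/local/include'],
                  library_dirs=['/usr/local/lib']),
        Extension('geodata.text._tokenize',
                  sources=['scripts/geodata/text/pytokenize.c'],
                  libraries=['postal'],
                  include_dirs=['/usr/local/include'],
                  library_dirs=['/usr/local/lib']),
    ]

    setup(name='geodata', ext_modules=ext_modules)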
104  scripts/geodata/text/token_types.py  Normal file
@@ -0,0 +1,104 @@
from geodata.enum import Enum, EnumValue


class token_types(Enum):
    # Word types
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # Special tokens
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # Numbers and numeric types
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # Punctuation types, may separate a phrase
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # Non-letters and whitespace
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

    # Phrase, special application-level type not returned by the tokenizer
    PHRASE = EnumValue(999)

    WORD_TOKEN_TYPES = set([
        WORD,
        ABBREVIATION,
        IDEOGRAPHIC_CHAR,
        HANGUL_SYLLABLE,
        ACRONYM
    ])

    NUMERIC_TOKEN_TYPES = set([
        NUMERIC,
        ORDINAL,
        ROMAN_NUMERAL,
        IDEOGRAPHIC_NUMBER,
    ])

    PUNCTUATION_TOKEN_TYPES = set([
        PERIOD,
        EXCLAMATION,
        QUESTION_MARK,
        COMMA,
        COLON,
        SEMICOLON,
        PLUS,
        AMPERSAND,
        AT_SIGN,
        POUND,
        ELLIPSIS,
        DASH,
        BREAKING_DASH,
        HYPHEN,
        PUNCT_OPEN,
        PUNCT_CLOSE,
        DOUBLE_QUOTE,
        SINGLE_QUOTE,
        OPEN_QUOTE,
        CLOSE_QUOTE,
        SLASH,
        BACKSLASH,
        GREATER_THAN,
        LESS_THAN,
    ])

    NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES | set([
        OTHER,
        WHITESPACE,
        NEWLINE,
    ])
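The class-level sets above make it easy to bucket token classes into broad categories. A small illustration, assuming from_id (provided by geodata.enum and used by normalize.py and tokenize.py in this commit) maps integer ids back to enum values; the helper function is hypothetical.

    from geodata.text.token_types import token_types

    def classify(token_class):
        # token_class is an EnumValue such as token_types.WORD or token_types.COMMA
        if token_class in token_types.WORD_TOKEN_TYPES:
            return 'word'
        elif token_class in token_types.NUMERIC_TOKEN_TYPES:
            return 'numeric'
        elif token_class in token_types.PUNCTUATION_TOKEN_TYPES:
            return 'punctuation'
        return 'other'

    print(classify(token_types.from_id(1)))    # id 1 is WORD, so expect 'word'
    print(classify(token_types.NUMERIC))       # expect 'numeric'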
11  scripts/geodata/text/tokenize.py  Normal file
@@ -0,0 +1,11 @@
from geodata.encoding import safe_encode, safe_decode
from geodata.text import _tokenize
from geodata.text.token_types import token_types


def tokenize(s, whitespace=False):
    u = safe_decode(s)
    s = safe_encode(s)
    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u, whitespace)]
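A quick sketch of the wrapper above, assuming the `_tokenize` extension is built. The underlying C function returns (offset, length, type id) triples over the UTF-8 bytes, which is why the wrapper slices the encoded string; passing whitespace=True asks libpostal to include whitespace tokens as well. Outputs are illustrative.

    from geodata.text.tokenize import tokenize

    # Something like [(u'123', NUMERIC), (u'Main', WORD), (u'St', ABBREVIATION), ...]
    print(tokenize(u'123 Main St.'))

    # Keep whitespace tokens between the words (WHITESPACE class).
    print(tokenize(u'123 Main St.', whitespace=True))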
16  scripts/geodata/text/utils.py  Normal file
@@ -0,0 +1,16 @@
import re

from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types

non_breaking_dash_regex = re.compile(u'[\-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]', re.UNICODE)


def is_numeric(s):
    tokens = tokenize(s)
    return sum((1 for t, c in tokens if c in token_types.NUMERIC_TOKEN_TYPES)) == len(tokens)


def is_numeric_strict(s):
    tokens = tokenize(s)
    return sum((1 for t, c in tokens if c == token_types.NUMERIC)) == len(tokens)
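is_numeric accepts any token whose class falls in NUMERIC_TOKEN_TYPES (so ordinals, Roman numerals, and ideographic numbers count), while is_numeric_strict requires every token to be plain NUMERIC. A hedged illustration, assuming the tokenizer classifies these inputs as expected:

    from geodata.text.utils import is_numeric, is_numeric_strict

    print(is_numeric(u'123'))          # expected True: single NUMERIC token
    print(is_numeric(u'3rd'))          # expected True: ORDINAL is in NUMERIC_TOKEN_TYPES
    print(is_numeric_strict(u'3rd'))   # expected False: ORDINAL is not NUMERIC
    print(is_numeric(u'Main'))         # expected False: WORD token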