From 58e53cab1c39da35f67b93ed30393943100e65fd Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 12 Jan 2016 13:26:55 -0500 Subject: [PATCH] [scripts] Adding the tokenize/normalize wrappers directly into the internal geodata package so pypostal can be maintained in an independent repo --- .../geodata/address_formatting/formatter.py | 2 +- scripts/geodata/language_id/disambiguation.py | 4 +- scripts/geodata/polygons/reverse_geocode.py | 4 +- scripts/geodata/text/__init__.py | 0 scripts/geodata/text/normalize.py | 84 +++++ scripts/geodata/text/pynormalize.c | 328 ++++++++++++++++++ scripts/geodata/text/pytokenize.c | 164 +++++++++ scripts/geodata/text/token_types.py | 68 ++++ scripts/geodata/text/tokenize.py | 14 + scripts/setup.py | 68 ++++ 10 files changed, 731 insertions(+), 5 deletions(-) create mode 100644 scripts/geodata/text/__init__.py create mode 100644 scripts/geodata/text/normalize.py create mode 100644 scripts/geodata/text/pynormalize.c create mode 100644 scripts/geodata/text/pytokenize.c create mode 100644 scripts/geodata/text/token_types.py create mode 100644 scripts/geodata/text/tokenize.py create mode 100644 scripts/setup.py diff --git a/scripts/geodata/address_formatting/formatter.py b/scripts/geodata/address_formatting/formatter.py index c853483c..533a8ebc 100644 --- a/scripts/geodata/address_formatting/formatter.py +++ b/scripts/geodata/address_formatting/formatter.py @@ -5,7 +5,7 @@ import re import subprocess import yaml -from postal.text.tokenize import tokenize, tokenize_raw, token_types +from geodata.text.tokenize import tokenize, tokenize_raw, token_types from collections import OrderedDict, defaultdict from itertools import ifilter diff --git a/scripts/geodata/language_id/disambiguation.py b/scripts/geodata/language_id/disambiguation.py index 2ff2717a..b107784d 100644 --- a/scripts/geodata/language_id/disambiguation.py +++ b/scripts/geodata/language_id/disambiguation.py @@ -11,13 +11,13 @@ sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir, os.pardir, 'python'))) from address_normalizer.text.normalize import PhraseFilter -from address_normalizer.text.tokenize import token_types from geodata.encoding import safe_decode from geodata.string_utils import wide_iter, wide_ord from geodata.i18n.unicode_paths import DATA_DIR from geodata.i18n.normalize import strip_accents from geodata.i18n.unicode_properties import get_chars_by_script, get_script_languages -from postal.text.tokenize import tokenize +from geodata.text.tokenize import tokenize +from geodata.text.tokenize import token_types WELL_REPRESENTED_LANGUAGES = set(['en', 'fr', 'it', 'de', 'nl', 'es', 'pt']) diff --git a/scripts/geodata/polygons/reverse_geocode.py b/scripts/geodata/polygons/reverse_geocode.py index d53e886f..3fba9df1 100644 --- a/scripts/geodata/polygons/reverse_geocode.py +++ b/scripts/geodata/polygons/reverse_geocode.py @@ -38,8 +38,8 @@ from geodata.osm.osm_admin_boundaries import OSMAdminPolygonReader from geodata.polygons.index import * from geodata.statistics.tf_idf import IDFIndex -from postal.text.tokenize import tokenize, token_types -from postal.text.normalize import * +from geodata.text.tokenize import tokenize, token_types +from geodata.text.normalize import * decode_latin1 = partial(safe_decode, encoding='latin1') diff --git a/scripts/geodata/text/__init__.py b/scripts/geodata/text/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/geodata/text/normalize.py 
b/scripts/geodata/text/normalize.py new file mode 100644 index 00000000..15c4e067 --- /dev/null +++ b/scripts/geodata/text/normalize.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +from geodata.text import _normalize +from geodata.text.tokenize import tokenize_raw +from geodata.text.token_types import token_types + +from geodata.encoding import safe_decode + +# String options +NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII +NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE +NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS +NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE +NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE +NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM +NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS + +DEFAULT_STRING_OPTIONS = NORMALIZE_STRING_LATIN_ASCII | \ + NORMALIZE_STRING_DECOMPOSE | \ + NORMALIZE_STRING_TRIM | \ + NORMALIZE_STRING_REPLACE_HYPHENS | \ + NORMALIZE_STRING_STRIP_ACCENTS | \ + NORMALIZE_STRING_LOWERCASE + +# Token options +NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS +NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS +NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD +NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS +NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES +NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE +NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC +NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS + +DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \ + NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ + NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \ + NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \ + NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE | \ + NORMALIZE_TOKEN_REPLACE_DIGITS + + +def remove_parens(tokens): + new_tokens = [] + open_parens = 0 + for t, c in tokens: + if c == token_types.PUNCT_OPEN: + open_parens += 1 + elif c == token_types.PUNCT_CLOSE: + if open_parens > 0: + open_parens -= 1 + elif open_parens <= 0: + new_tokens.append((t, c)) + return new_tokens + + +def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, + token_options=DEFAULT_TOKEN_OPTIONS, + strip_parentheticals=True): + ''' + Normalizes a string, tokenizes, and normalizes each token + with string and token-level options. + + This version only uses libpostal's deterministic normalizations + i.e. methods with a single output. The string tree version will + return multiple normalized strings, each with tokens. 
+ + Usage: + normalized_tokens(u'St.-Barthélemy') + ''' + s = safe_decode(s) + if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII: + normalized = _normalize.normalize_string_latin(s, string_options) + else: + normalized = _normalize.normalize_string_utf8(s, string_options) + + # Tuples of (offset, len, type) + raw_tokens = tokenize_raw(normalized) + tokens = [(_normalize.normalize_token(normalized, t, token_options), + token_types.from_id(t[-1])) for t in raw_tokens] + + if strip_parentheticals: + return remove_parens(tokens) + else: + return tokens diff --git a/scripts/geodata/text/pynormalize.c b/scripts/geodata/text/pynormalize.c new file mode 100644 index 00000000..46b959f9 --- /dev/null +++ b/scripts/geodata/text/pynormalize.c @@ -0,0 +1,328 @@ +#include + +#include "src/normalize.h" +#include "src/transliterate.h" + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +struct module_state { + PyObject *error; +}; + + +#ifdef IS_PY3K + #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else + #define GETSTATE(m) (&_state) + static struct module_state _state; +#endif + + + +static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) +{ + PyObject *arg1; + uint64_t options; + if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) { + return 0; + } + + PyObject *unistr = PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be converted to unicode in scanner"); + return 0; + } + + #ifdef IS_PY3K + // Python 3 encoding, supported by Python 3.3+ + + char *input = PyUnicode_AsUTF8(unistr); + + #else + // Python 2 encoding + + PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); + if (str == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be utf-8 encoded"); + goto exit_decref_unistr; + } + + char *input = PyBytes_AsString(str); + + #endif + + if (input == NULL) { + goto exit_decref_str; + } + + char *normalized = normalize_string_utf8(input, options); + + if (normalized == NULL) { + goto exit_decref_str; + } + + PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict"); + free(normalized); + if (result == NULL) { + PyErr_SetString(PyExc_ValueError, + "Result could not be utf-8 decoded"); + goto exit_decref_str; + } + + #ifndef IS_PY3K + Py_XDECREF(str); + #endif + Py_XDECREF(unistr); + + return result; + +exit_decref_str: +#ifndef IS_PY3K + Py_XDECREF(str); +#endif +exit_decref_unistr: + Py_XDECREF(unistr); + return 0; +} + + +static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) +{ + PyObject *arg1; + uint64_t options; + if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) { + return 0; + } + + PyObject *unistr = PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be converted to unicode in scanner"); + return 0; + } + + #ifdef IS_PY3K + // Python 3 encoding, supported by Python 3.3+ + + char *input = PyUnicode_AsUTF8(unistr); + + #else + // Python 2 encoding + + PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); + if (str == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be utf-8 encoded"); + goto exit_decref_unistr; + } + + char *input = PyBytes_AsString(str); + + #endif + + if (input == NULL) { + goto exit_decref_str; + } + + char *normalized = normalize_string_latin(input, strlen(input), options); + + PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), 
"strict"); + free(normalized); + if (result == NULL) { + PyErr_SetString(PyExc_ValueError, + "Result could not be utf-8 decoded"); + goto exit_decref_str; + } + + #ifndef IS_PY3K + Py_XDECREF(str); + #endif + Py_XDECREF(unistr); + + return result; + +exit_decref_str: +#ifndef IS_PY3K + Py_XDECREF(str); +#endif +exit_decref_unistr: + Py_XDECREF(unistr); + return 0; +} + + + +static PyObject *py_normalize_token(PyObject *self, PyObject *args) +{ + PyObject *s; + + uint32_t offset; + uint32_t len; + uint16_t type; + + uint64_t options; + if (!PyArg_ParseTuple(args, "O(IIH)K:normalize", &s, &offset, &len, &type, &options)) { + PyErr_SetString(PyExc_TypeError, + "Error parsing arguments"); + return 0; + } + + token_t token = (token_t){(size_t)offset, (size_t)len, type}; + + PyObject *unistr = PyUnicode_FromObject(s); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be converted to unicode in scanner"); + return 0; + } + + #ifdef IS_PY3K + // Python 3 encoding, supported by Python 3.3+ + + char *input = PyUnicode_AsUTF8(unistr); + + #else + // Python 2 encoding + + PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); + if (str == NULL) { + PyErr_SetString(PyExc_ValueError, + "Parameter could not be utf-8 encoded"); + goto exit_decref_unistr; + } + + char *input = PyBytes_AsString(str); + + #endif + + if (input == NULL) { + goto exit_decref_str; + } + + char_array *token_buffer = char_array_new_size(token.len); + + add_normalized_token(token_buffer, input, token, options); + char *token_str = char_array_get_string(token_buffer); + PyObject *result = PyUnicode_DecodeUTF8((const char *)token_str, token_buffer->n - 1, "strict"); + + if (result == NULL) { + PyErr_SetString(PyExc_ValueError, + "Error decoding token"); + char_array_destroy(token_buffer); + goto exit_decref_str; + } + + char_array_destroy(token_buffer); + + #ifndef IS_PY3K + Py_XDECREF(str); + #endif + Py_XDECREF(unistr); + + return result; + +exit_decref_str: +#ifndef IS_PY3K + Py_XDECREF(str); +#endif +exit_decref_unistr: + Py_XDECREF(unistr); + return 0; +} + +static PyMethodDef normalize_methods[] = { + {"normalize_string_utf8", (PyCFunction)py_normalize_string_utf8, METH_VARARGS, "normalize_string_utf8(input, options)"}, + {"normalize_string_latin", (PyCFunction)py_normalize_string_latin, METH_VARARGS, "normalize_string_latin(input, options)"}, + {"normalize_token", (PyCFunction)py_normalize_token, METH_VARARGS, "normalize_token(input, options)"}, + {NULL, NULL}, +}; + + + +#ifdef IS_PY3K + +static int normalize_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int normalize_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + return 0; +} + + +static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_normalize", + NULL, + sizeof(struct module_state), + normalize_methods, + NULL, + normalize_traverse, + normalize_clear, + NULL +}; + +#define INITERROR return NULL + +PyObject * +PyInit_normalize(void) { +#else +#define INITERROR return + +void +init_normalize(void) { +#endif + +#ifdef IS_PY3K + PyObject *module = PyModule_Create(&module_def); +#else + PyObject *module = Py_InitModule("_normalize", normalize_methods); +#endif + + if (module == NULL) + INITERROR; + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("_normalize.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + + if (!transliteration_module_setup(NULL)) { + 
PyErr_SetString(PyExc_RuntimeError, + "Could not load transliterate module"); + Py_DECREF(module); + INITERROR; + } + + + PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LATIN_ASCII)); + PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRANSLITERATE)); + PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_STRIP_ACCENTS)); + PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE)); + PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE)); + PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM)); + PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS)); + + + PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_FINAL_PERIOD)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_DIGITS)); + + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif +} \ No newline at end of file diff --git a/scripts/geodata/text/pytokenize.c b/scripts/geodata/text/pytokenize.c new file mode 100644 index 00000000..7986bae3 --- /dev/null +++ b/scripts/geodata/text/pytokenize.c @@ -0,0 +1,164 @@ +#include + +#include "src/scanner.h" + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +struct module_state { + PyObject *error; +}; + + +#ifdef IS_PY3K + #define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) +#else + #define GETSTATE(m) (&_state) + static struct module_state _state; +#endif + + +static PyObject *py_tokenize(PyObject *self, PyObject *args) +{ + PyObject *arg1; + if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) { + return 0; + } + + PyObject *unistr = PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be converted to unicode in scanner"); + return 0; + } + + #ifdef IS_PY3K + // Python 3 encoding, supported by Python 3.3+ + + char *input = PyUnicode_AsUTF8(unistr); + + #else + // Python 2 encoding + + PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); + if (str == NULL) { + PyErr_SetString(PyExc_TypeError, + "Parameter could not be utf-8 encoded"); + goto error_decref_unistr; + } + + char *input = PyBytes_AsString(str); + + #endif + + + if (input == NULL) { + goto error_decref_str; + } + + token_array *tokens = tokenize(input); + if (tokens == NULL) { + goto error_decref_str; + } + + 
PyObject *result = PyTuple_New(tokens->n); + if (!result) { + token_array_destroy(tokens); + goto error_decref_str; + return 0; + } + + PyObject *tuple; + + token_t token; + for (size_t i = 0; i < tokens->n; i++) { + token = tokens->a[i]; + tuple = Py_BuildValue("III", token.offset, token.len, token.type); + if (PyTuple_SetItem(result, i, tuple) < 0) { + token_array_destroy(tokens); + goto error_decref_str; + } + } + + #ifndef IS_PY3K + Py_XDECREF(str); + #endif + Py_XDECREF(unistr); + + token_array_destroy(tokens); + + return result; + +error_decref_str: +#ifndef IS_PY3K + Py_XDECREF(str); +#endif +error_decref_unistr: + Py_XDECREF(unistr); + return 0; +} + +static PyMethodDef tokenize_methods[] = { + {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"}, + {NULL, NULL}, +}; + + + +#ifdef IS_PY3K + +static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) { + Py_VISIT(GETSTATE(m)->error); + return 0; +} + +static int tokenize_clear(PyObject *m) { + Py_CLEAR(GETSTATE(m)->error); + return 0; +} + + +static struct PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_tokenize", + NULL, + sizeof(struct module_state), + tokenize_methods, + NULL, + tokenize_traverse, + tokenize_clear, + NULL +}; + +#define INITERROR return NULL + +PyObject * +PyInit_tokenize(void) { +#else +#define INITERROR return + +void +init_tokenize(void) { +#endif + +#ifdef IS_PY3K + PyObject *module = PyModule_Create(&module_def); +#else + PyObject *module = Py_InitModule("_tokenize", tokenize_methods); +#endif + + if (module == NULL) + INITERROR; + struct module_state *st = GETSTATE(module); + + st->error = PyErr_NewException("_tokenize.Error", NULL, NULL); + if (st->error == NULL) { + Py_DECREF(module); + INITERROR; + } + +#if PY_MAJOR_VERSION >= 3 + return module; +#endif +} \ No newline at end of file diff --git a/scripts/geodata/text/token_types.py b/scripts/geodata/text/token_types.py new file mode 100644 index 00000000..021b7918 --- /dev/null +++ b/scripts/geodata/text/token_types.py @@ -0,0 +1,68 @@ +from geodata.enum import Enum, EnumValue + + +class token_types(Enum): + # Word types + WORD = EnumValue(1) + ABBREVIATION = EnumValue(2) + IDEOGRAPHIC_CHAR = EnumValue(3) + HANGUL_SYLLABLE = EnumValue(4) + ACRONYM = EnumValue(5) + + # Special tokens + EMAIL = EnumValue(20) + URL = EnumValue(21) + US_PHONE = EnumValue(22) + INTL_PHONE = EnumValue(23) + + # Numbers and numeric types + NUMERIC = EnumValue(50) + ORDINAL = EnumValue(51) + ROMAN_NUMERAL = EnumValue(52) + IDEOGRAPHIC_NUMBER = EnumValue(53) + + # Punctuation types, may separate a phrase + PERIOD = EnumValue(100) + EXCLAMATION = EnumValue(101) + QUESTION_MARK = EnumValue(102) + COMMA = EnumValue(103) + COLON = EnumValue(104) + SEMICOLON = EnumValue(105) + PLUS = EnumValue(106) + AMPERSAND = EnumValue(107) + AT_SIGN = EnumValue(108) + POUND = EnumValue(109) + ELLIPSIS = EnumValue(110) + DASH = EnumValue(111) + BREAKING_DASH = EnumValue(112) + HYPHEN = EnumValue(113) + PUNCT_OPEN = EnumValue(114) + PUNCT_CLOSE = EnumValue(115) + DOUBLE_QUOTE = EnumValue(119) + SINGLE_QUOTE = EnumValue(120) + OPEN_QUOTE = EnumValue(121) + CLOSE_QUOTE = EnumValue(122) + SLASH = EnumValue(124) + BACKSLASH = EnumValue(125) + GREATER_THAN = EnumValue(126) + LESS_THAN = EnumValue(127) + + # Non-letters and whitespace + OTHER = EnumValue(200) + WHITESPACE = EnumValue(300) + NEWLINE = EnumValue(301) + + WORD_TOKEN_TYPES = set([ + WORD, + ABBREVIATION, + IDEOGRAPHIC_CHAR, + HANGUL_SYLLABLE, + ACRONYM + ]) + + NUMERIC_TOKEN_TYPES = set([ + NUMERIC, + 
ORDINAL, + ROMAN_NUMERAL, + IDEOGRAPHIC_NUMBER, + ]) diff --git a/scripts/geodata/text/tokenize.py b/scripts/geodata/text/tokenize.py new file mode 100644 index 00000000..d3d18832 --- /dev/null +++ b/scripts/geodata/text/tokenize.py @@ -0,0 +1,14 @@ +from geodata.encoding import safe_encode, safe_decode +from geodata.text import _tokenize +from geodata.text.token_types import token_types + + +def tokenize_raw(s): + return _tokenize.tokenize(safe_decode(s)) + + +def tokenize(s): + u = safe_decode(s) + s = safe_encode(s) + return [(safe_decode(s[start:start + length]), token_types.from_id(token_type)) + for start, length, token_type in _tokenize.tokenize(u)] diff --git a/scripts/setup.py b/scripts/setup.py new file mode 100644 index 00000000..c79f659a --- /dev/null +++ b/scripts/setup.py @@ -0,0 +1,68 @@ +import os + +from setuptools import setup, Extension, find_packages + +this_dir = os.path.realpath(os.path.dirname(__file__)) +PROJECT_DIR = os.path.realpath(os.path.join(this_dir, os.pardir)) +SRC_DIR = os.path.join(PROJECT_DIR, 'src') + + +def main(): + setup( + name='geodata', + version='0.1', + install_requires=[ + 'fiona', + 'lxml', + 'marisa_trie', + 'pycountry', + 'pyproj', + 'python-Levenshtein', + 'requests', + 'rtree', + 'shapely', + 'six', + 'ujson', + ], + packages=find_packages(), + ext_modules=[ + Extension('geodata.text._tokenize', + sources=[os.path.join(SRC_DIR, f) + for f in ('scanner.c', + 'string_utils.c', + 'tokens.c', + 'utf8proc/utf8proc.c', + ) + ] + ['geodata/text/pytokenize.c'], + include_dirs=[os.path.join(this_dir, os.pardir)], + extra_compile_args=['-O0', '-std=c99', + '-Wno-unused-function'], + ), + Extension('geodata.text._normalize', + sources=[os.path.join(SRC_DIR, f) + for f in ('normalize.c', + 'string_utils.c', + 'utf8proc/utf8proc.c', + 'tokens.c', + 'unicode_scripts.c', + 'transliterate.c', + 'file_utils.c', + 'trie.c', + 'trie_search.c',) + ] + ['geodata/text/pynormalize.c'], + include_dirs=[os.path.join(this_dir, os.pardir)], + extra_compile_args=['-std=c99', '-DHAVE_CONFIG_H', + '-Wno-unused-function'], + ), + ], + include_package_data=True, + zip_safe=False, + url='http://mapzen.com', + description='Utilities for working with geographic data', + license='MIT License', + maintainer='mapzen.com', + maintainer_email='pelias@mapzen.com' + ) + +if __name__ == '__main__': + main()
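
Usage sketch (not part of the patch): once the extensions declared in scripts/setup.py are built in place (for example, running "python setup.py build_ext --inplace" from the scripts/ directory), the new geodata.text wrappers added above could be exercised roughly as follows. This is a minimal sketch based only on the module and function names in the diff; the example inputs and the token types shown in the comments are illustrative assumptions, not verified output.

    # sketch: assumes the geodata package from scripts/ is importable and the
    # geodata.text._tokenize / geodata.text._normalize C extensions are built
    from geodata.text.tokenize import tokenize
    from geodata.text.normalize import normalized_tokens
    from geodata.text.token_types import token_types

    # tokenize returns (token, token_type) pairs, e.g. something like
    # [(u'123', token_types.NUMERIC), (u'Main', token_types.WORD), ...]
    print(tokenize(u'123 Main St.'))

    # normalized_tokens applies DEFAULT_STRING_OPTIONS (lowercase, strip
    # accents, replace hyphens, ...) plus DEFAULT_TOKEN_OPTIONS per token,
    # so u'St.-Barthélemy' should come back roughly as
    # [(u'st', ...), (u'barthelemy', ...)]; exact types depend on the scanner
    print(normalized_tokens(u'St.-Barthélemy'))

The sketch mirrors the docstring example in scripts/geodata/text/normalize.py; it is meant only to show how the wrapper modules fit together after this change, not as a definitive test of libpostal's normalization output.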