Initial fork commit
0  scripts/geodata/text/__init__.py  Normal file
76  scripts/geodata/text/normalize.py  Normal file
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
import six

from geodata.text import _normalize
from geodata.text.token_types import token_types

from geodata.encoding import safe_decode

# String options
NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII
NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE
NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS
NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE
NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE
NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM
NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS
NORMALIZE_STRING_SIMPLE_LATIN_ASCII = _normalize.NORMALIZE_STRING_SIMPLE_LATIN_ASCII

DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS

# Token options
NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS
NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS

DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS

TOKEN_OPTIONS_DROP_PERIODS = _normalize.NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS
DEFAULT_TOKEN_OPTIONS_NUMERIC = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC


def remove_parens(tokens):
    new_tokens = []
    open_parens = 0
    for t, c in tokens:
        if c == token_types.PUNCT_OPEN:
            open_parens += 1
        elif c == token_types.PUNCT_CLOSE:
            if open_parens > 0:
                open_parens -= 1
        elif open_parens <= 0:
            new_tokens.append((t, c))
    return new_tokens


def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
    s = safe_decode(s)
    return _normalize.normalize_string(s, string_options)


def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True, whitespace=False):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)
    normalized_tokens = _normalize.normalized_tokens(s, string_options, token_options, whitespace)

    if strip_parentheticals:
        normalized_tokens = remove_parens(normalized_tokens)

    return [(s, token_types.from_id(token_type)) for s, token_type in normalized_tokens]
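A minimal usage sketch for the wrapper above, assuming the `_normalize` C extension has been built and the geodata package is importable. The exact tokens returned depend on the options and on the libpostal data files installed, so the outputs shown in comments are illustrative only.

    from geodata.text.normalize import (normalized_tokens, normalize_string,
                                        NORMALIZE_STRING_LOWERCASE, NORMALIZE_STRING_LATIN_ASCII)

    # Full pipeline: string normalization, tokenization, per-token normalization.
    # Expect something like [(u'st', WORD), (u'barthelemy', WORD)] with the default options.
    print(normalized_tokens(u'St.-Barthélemy'))

    # String-level normalization only, with explicit option flags ORed together.
    print(normalize_string(u'Quatre-Vingt-Douze',
                           NORMALIZE_STRING_LOWERCASE | NORMALIZE_STRING_LATIN_ASCII))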
75  scripts/geodata/text/phrases.py  Normal file
@@ -0,0 +1,75 @@
import six

from collections import *
from marisa_trie import BytesTrie
from geodata.encoding import safe_encode, safe_decode

SENTINEL = None


class PhraseFilter(object):
    def __init__(self, phrases):
        if hasattr(phrases, 'items'):
            phrases = six.iteritems(phrases)
        vals = [(safe_decode(key), self.serialize(val)) for key, val in phrases]
        self.trie = BytesTrie(vals)

    serialize = staticmethod(safe_encode)
    deserialize = staticmethod(safe_decode)

    def filter(self, tokens):
        def return_item(item):
            return False, item, []

        if not tokens:
            return

        ent = []
        ent_tokens = []

        queue = deque(tokens + [(SENTINEL,) * 2])
        skip_until = 0

        trie = self.trie

        while queue:
            item = queue.popleft()
            t, c = item

            if t is not SENTINEL and trie.has_keys_with_prefix(u' '.join(ent_tokens + [t])):
                ent.append(item)
                ent_tokens.append(item[0])
            elif ent_tokens:
                res = trie.get(u' '.join(ent_tokens)) or None
                if res is not None:
                    yield (True, ent, map(self.deserialize, res))
                    queue.appendleft(item)
                    ent = []
                    ent_tokens = []
                elif len(ent_tokens) == 1:
                    yield return_item(ent[0])
                    ent = []
                    ent_tokens = []
                    queue.appendleft(item)
                else:
                    have_phrase = False

                    for i in xrange(len(ent) - 1, 0, -1):
                        remainder = ent[i:]
                        res = trie.get(u' '.join([e[0] for e in ent[:i]])) or None
                        if res is not None:
                            yield (True, ent[:i], map(self.deserialize, res))
                            have_phrase = True
                            break

                    if not have_phrase:
                        yield return_item(ent[0])

                    todos = list(remainder)
                    todos.append(item)
                    queue.extendleft(reversed(todos))

                    ent = []
                    ent_tokens = []
            elif t is not SENTINEL:
                yield return_item(item)
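A sketch of how PhraseFilter might be driven, assuming marisa_trie is installed. Trie keys are space-joined token strings, and filter() yields (is_phrase, token(s), data) triples: a matched phrase comes back as a list of (token, class) pairs plus its deserialized values, and everything else as a single pair. The sample phrase dictionary and output are illustrative, not part of this commit.

    from geodata.text.phrases import PhraseFilter
    from geodata.text.tokenize import tokenize

    # Values are serialized with safe_encode by default; any string payload works.
    phrases = {u'new york': u'city', u'new york city': u'city'}
    phrase_filter = PhraseFilter(phrases)

    tokens = tokenize(u'New York City Hall')
    # Trie keys are case-sensitive, so tokens would normally be normalized/lowercased first.
    for is_phrase, item, data in phrase_filter.filter([(t.lower(), c) for t, c in tokens]):
        print(is_phrase, item, list(data))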
291  scripts/geodata/text/pynormalize.c  Normal file
@@ -0,0 +1,291 @@
#include <Python.h>

#include <libpostal/libpostal.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

struct module_state {
    PyObject *error;
};


#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif

static PyObject *py_normalize_string(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    uint64_t options;
    if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) {
        return 0;
    }

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    char *input = PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto exit_normalize_decref_unistr;
    }

    char *input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto exit_normalize_decref_str;
    }

    char *normalized = libpostal_normalize_string(input, options);

    if (normalized == NULL) {
        goto exit_normalize_decref_str;
    }

    PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict");
    free(normalized);
    if (result == NULL) {
        PyErr_SetString(PyExc_ValueError,
                        "Result could not be utf-8 decoded");
        goto exit_normalize_decref_str;
    }

#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;

exit_normalize_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_normalize_decref_unistr:
    Py_XDECREF(unistr);
    return 0;
}


static PyObject *py_normalized_tokens(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    uint64_t string_options = LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS;
    uint64_t token_options = LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS;
    uint32_t arg_whitespace = 0;

    PyObject *result = NULL;

    if (!PyArg_ParseTuple(args, "O|KKI:normalize", &arg1, &string_options, &token_options, &arg_whitespace)) {
        return 0;
    }

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    char *input = PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto exit_normalized_tokens_decref_str;
    }

    char *input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto exit_normalized_tokens_decref_str;
    }

    bool whitespace = arg_whitespace;

    size_t num_tokens;
    libpostal_normalized_token_t *normalized_tokens = libpostal_normalized_tokens(input, string_options, token_options, whitespace, &num_tokens);

    if (normalized_tokens == NULL) {
        goto exit_normalized_tokens_decref_str;
    }

    result = PyList_New((Py_ssize_t)num_tokens);
    if (!result) {
        goto exit_free_normalized_tokens;
    }

    for (size_t i = 0; i < num_tokens; i++) {
        libpostal_normalized_token_t normalized_token = normalized_tokens[i];
        char *token_str = normalized_token.str;
        PyObject *py_token = PyUnicode_DecodeUTF8((const char *)token_str, strlen(token_str), "strict");
        if (py_token == NULL) {
            Py_DECREF(result);
            goto exit_free_normalized_tokens;
        }

        PyObject *t = PyTuple_New(2);
        // PyInt_* only exists on Python 2; use PyLong_* for the Python 3 build
#ifdef IS_PY3K
        PyObject *py_token_type = PyLong_FromLong(normalized_token.token.type);
#else
        PyObject *py_token_type = PyInt_FromLong(normalized_token.token.type);
#endif

        PyTuple_SetItem(t, 0, py_token);
        PyTuple_SetItem(t, 1, py_token_type);

        // Note: PyList_SetItem steals a reference, so don't worry about DECREF
        PyList_SetItem(result, (Py_ssize_t)i, t);
    }

    for (size_t i = 0; i < num_tokens; i++) {
        free(normalized_tokens[i].str);
    }
    free(normalized_tokens);

#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    return result;
exit_free_normalized_tokens:
    for (size_t i = 0; i < num_tokens; i++) {
        free(normalized_tokens[i].str);
    }
    free(normalized_tokens);
exit_normalized_tokens_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
exit_normalized_tokens_decref_unistr:
    Py_XDECREF(unistr);
    return 0;
}


static PyMethodDef normalize_methods[] = {
    {"normalize_string", (PyCFunction)py_normalize_string, METH_VARARGS, "normalize_string(input, options)"},
    {"normalized_tokens", (PyCFunction)py_normalized_tokens, METH_VARARGS, "normalized_tokens(input, string_options, token_options, whitespace)"},
    {NULL, NULL},
};


#ifdef IS_PY3K

static int normalize_traverse(PyObject *m, visitproc visit, void *arg) {
    Py_VISIT(GETSTATE(m)->error);
    return 0;
}

static int normalize_clear(PyObject *m) {
    Py_CLEAR(GETSTATE(m)->error);
    return 0;
}


static struct PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_normalize",
    NULL,
    sizeof(struct module_state),
    normalize_methods,
    NULL,
    normalize_traverse,
    normalize_clear,
    NULL
};

#define INITERROR return NULL

// The module is named "_normalize", so the Python 3 entry point must be PyInit__normalize
PyObject *
PyInit__normalize(void) {
#else
#define INITERROR return

void
init_normalize(void) {
#endif

#ifdef IS_PY3K
    PyObject *module = PyModule_Create(&module_def);
#else
    PyObject *module = Py_InitModule("_normalize", normalize_methods);
#endif

    if (module == NULL)
        INITERROR;
    struct module_state *st = GETSTATE(module);

    st->error = PyErr_NewException("_normalize.Error", NULL, NULL);
    if (st->error == NULL) {
        Py_DECREF(module);
        INITERROR;
    }

    if (!libpostal_setup()) {
        PyErr_SetString(PyExc_RuntimeError,
                        "Could not load libpostal");
        Py_DECREF(module);
        INITERROR;
    }

    PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII));
    PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE));
    PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS));
    PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE));
    PyModule_AddObject(module, "NORMALIZE_STRING_COMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_COMPOSE));
    PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LOWERCASE));
    PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRIM));
    PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS));
    PyModule_AddObject(module, "NORMALIZE_STRING_SIMPLE_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII));


    PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC));
    PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS));


    PyModule_AddObject(module, "NORMALIZE_DEFAULT_STRING_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS));
    PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS));

    PyModule_AddObject(module, "NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS));

    PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC));


#if PY_MAJOR_VERSION >= 3
    return module;
#endif
}
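The extension above exposes normalize_string(input, options) and normalized_tokens(input, string_options, token_options, whitespace), where the options are the uint64 bit flags registered at module init and normalized_tokens returns (token, integer type id) pairs. A small sketch of calling it directly, assuming the extension has been compiled against libpostal; the normalize.py wrapper earlier in this commit is the usual entry point, and the printed values are illustrative.

    from geodata.text import _normalize

    options = (_normalize.NORMALIZE_STRING_LOWERCASE |
               _normalize.NORMALIZE_STRING_LATIN_ASCII |
               _normalize.NORMALIZE_STRING_TRIM)

    print(_normalize.normalize_string(u'São Paulo ', options))

    # Raw token types come back as integer ids; token_types.from_id maps them to enum values.
    print(_normalize.normalized_tokens(u'São Paulo',
                                       _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS,
                                       _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS,
                                       False))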
167  scripts/geodata/text/pytokenize.c  Normal file
@@ -0,0 +1,167 @@
#include <Python.h>

#include <libpostal/libpostal.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

struct module_state {
    PyObject *error;
};


#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif

static PyObject *py_tokenize(PyObject *self, PyObject *args)
{
    PyObject *arg1;
    uint32_t arg_whitespace = 0;

    if (!PyArg_ParseTuple(args, "OI:tokenize", &arg1, &arg_whitespace)) {
        return 0;
    }

    bool whitespace = arg_whitespace;

    PyObject *unistr = PyUnicode_FromObject(arg1);
    if (unistr == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be converted to unicode in scanner");
        return 0;
    }

#ifdef IS_PY3K
    // Python 3 encoding, supported by Python 3.3+
    char *input = PyUnicode_AsUTF8(unistr);
#else
    // Python 2 encoding
    PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
    if (str == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "Parameter could not be utf-8 encoded");
        goto error_decref_unistr;
    }

    char *input = PyBytes_AsString(str);
#endif

    if (input == NULL) {
        goto error_decref_str;
    }

    size_t num_tokens;

    libpostal_token_t *tokens = libpostal_tokenize(input, whitespace, &num_tokens);
    if (tokens == NULL) {
        goto error_decref_str;
    }

    PyObject *result = PyTuple_New(num_tokens);
    if (!result) {
        free(tokens);
        goto error_decref_str;
    }

    PyObject *tuple;

    libpostal_token_t token;
    for (size_t i = 0; i < num_tokens; i++) {
        token = tokens[i];
        tuple = Py_BuildValue("III", token.offset, token.len, token.type);
        if (PyTuple_SetItem(result, i, tuple) < 0) {
            free(tokens);
            goto error_decref_str;
        }
    }

#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
    Py_XDECREF(unistr);

    free(tokens);

    return result;

error_decref_str:
#ifndef IS_PY3K
    Py_XDECREF(str);
#endif
error_decref_unistr:
    Py_XDECREF(unistr);
    return 0;
}

static PyMethodDef tokenize_methods[] = {
    {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text, whitespace)"},
    {NULL, NULL},
};

#ifdef IS_PY3K

static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) {
    Py_VISIT(GETSTATE(m)->error);
    return 0;
}

static int tokenize_clear(PyObject *m) {
    Py_CLEAR(GETSTATE(m)->error);
    return 0;
}


static struct PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenize",
    NULL,
    sizeof(struct module_state),
    tokenize_methods,
    NULL,
    tokenize_traverse,
    tokenize_clear,
    NULL
};

#define INITERROR return NULL

// The module is named "_tokenize", so the Python 3 entry point must be PyInit__tokenize
PyObject *
PyInit__tokenize(void) {
#else
#define INITERROR return

void
init_tokenize(void) {
#endif

#ifdef IS_PY3K
    PyObject *module = PyModule_Create(&module_def);
#else
    PyObject *module = Py_InitModule("_tokenize", tokenize_methods);
#endif

    if (module == NULL)
        INITERROR;
    struct module_state *st = GETSTATE(module);

    st->error = PyErr_NewException("_tokenize.Error", NULL, NULL);
    if (st->error == NULL) {
        Py_DECREF(module);
        INITERROR;
    }

#if PY_MAJOR_VERSION >= 3
    return module;
#endif
}
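Both C extensions link against an installed libpostal. The build configuration is not part of this commit; the fragment below is a hypothetical setup.py sketch of how the modules might be compiled, with the module names and source paths taken from the files above and the libpostal include/library locations assumed.

    from setuptools import setup, Extension

    ext_modules = [
        Extension('geodata.text._normalize',
                  sources=['scripts/geodata/text/pynormalize.c'],
                  libraries=['postal'],
                  include_dirs=['/usr/local/include'],
                  library_dirs=['/usr/local/lib']),
        Extension('geodata.text._tokenize',
                  sources=['scripts/geodata/text/pytokenize.c'],
                  libraries=['postal'],
                  include_dirs=['/usr/local/include'],
                  library_dirs=['/usr/local/lib']),
    ]

    setup(name='geodata', ext_modules=ext_modules)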
104  scripts/geodata/text/token_types.py  Normal file
@@ -0,0 +1,104 @@
from geodata.enum import Enum, EnumValue


class token_types(Enum):
    # Word types
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # Special tokens
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # Numbers and numeric types
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # Punctuation types, may separate a phrase
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # Non-letters and whitespace
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

    # Phrase, special application-level type not returned by the tokenizer
    PHRASE = EnumValue(999)

    WORD_TOKEN_TYPES = set([
        WORD,
        ABBREVIATION,
        IDEOGRAPHIC_CHAR,
        HANGUL_SYLLABLE,
        ACRONYM
    ])

    NUMERIC_TOKEN_TYPES = set([
        NUMERIC,
        ORDINAL,
        ROMAN_NUMERAL,
        IDEOGRAPHIC_NUMBER,
    ])

    PUNCTUATION_TOKEN_TYPES = set([
        PERIOD,
        EXCLAMATION,
        QUESTION_MARK,
        COMMA,
        COLON,
        SEMICOLON,
        PLUS,
        AMPERSAND,
        AT_SIGN,
        POUND,
        ELLIPSIS,
        DASH,
        BREAKING_DASH,
        HYPHEN,
        PUNCT_OPEN,
        PUNCT_CLOSE,
        DOUBLE_QUOTE,
        SINGLE_QUOTE,
        OPEN_QUOTE,
        CLOSE_QUOTE,
        SLASH,
        BACKSLASH,
        GREATER_THAN,
        LESS_THAN,
    ])

    NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES | set([
        OTHER,
        WHITESPACE,
        NEWLINE,
    ])
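The class-level sets above make it easy to bucket token classes into broad categories. A small illustration, assuming from_id (provided by geodata.enum and used by normalize.py and tokenize.py in this commit) maps integer ids back to enum values; the helper function is hypothetical.

    from geodata.text.token_types import token_types

    def classify(token_class):
        # token_class is an EnumValue such as token_types.WORD or token_types.COMMA
        if token_class in token_types.WORD_TOKEN_TYPES:
            return 'word'
        elif token_class in token_types.NUMERIC_TOKEN_TYPES:
            return 'numeric'
        elif token_class in token_types.PUNCTUATION_TOKEN_TYPES:
            return 'punctuation'
        return 'other'

    print(classify(token_types.from_id(1)))    # id 1 is WORD, so expect 'word'
    print(classify(token_types.NUMERIC))       # expect 'numeric'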
11  scripts/geodata/text/tokenize.py  Normal file
@@ -0,0 +1,11 @@
from geodata.encoding import safe_encode, safe_decode
from geodata.text import _tokenize
from geodata.text.token_types import token_types


def tokenize(s, whitespace=False):
    u = safe_decode(s)
    s = safe_encode(s)
    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u, whitespace)]
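A quick sketch of the wrapper above, assuming the `_tokenize` extension is built. The underlying C function returns (offset, length, type id) triples over the UTF-8 bytes, which is why the wrapper slices the encoded string; passing whitespace=True asks libpostal to include whitespace tokens as well. Outputs are illustrative.

    from geodata.text.tokenize import tokenize

    # Something like [(u'123', NUMERIC), (u'Main', WORD), (u'St', ABBREVIATION), ...]
    print(tokenize(u'123 Main St.'))

    # Keep whitespace tokens between the words (WHITESPACE class).
    print(tokenize(u'123 Main St.', whitespace=True))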
16  scripts/geodata/text/utils.py  Normal file
@@ -0,0 +1,16 @@
import re

from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types

non_breaking_dash_regex = re.compile(u'[\-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]', re.UNICODE)


def is_numeric(s):
    tokens = tokenize(s)
    return sum((1 for t, c in tokens if c in token_types.NUMERIC_TOKEN_TYPES)) == len(tokens)


def is_numeric_strict(s):
    tokens = tokenize(s)
    return sum((1 for t, c in tokens if c == token_types.NUMERIC)) == len(tokens)
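is_numeric accepts any token whose class falls in NUMERIC_TOKEN_TYPES (so ordinals, Roman numerals, and ideographic numbers count), while is_numeric_strict requires every token to be plain NUMERIC. A hedged illustration, assuming the tokenizer classifies these inputs as expected:

    from geodata.text.utils import is_numeric, is_numeric_strict

    print(is_numeric(u'123'))          # expected True: single NUMERIC token
    print(is_numeric(u'3rd'))          # expected True: ORDINAL is in NUMERIC_TOKEN_TYPES
    print(is_numeric_strict(u'3rd'))   # expected False: ORDINAL is not NUMERIC
    print(is_numeric(u'Main'))         # expected False: WORD token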