Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions

View File

@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
import six
from geodata.text import _normalize
from geodata.text.token_types import token_types
from geodata.encoding import safe_decode
# String options
NORMALIZE_STRING_LATIN_ASCII = _normalize.NORMALIZE_STRING_LATIN_ASCII
NORMALIZE_STRING_TRANSLITERATE = _normalize.NORMALIZE_STRING_TRANSLITERATE
NORMALIZE_STRING_STRIP_ACCENTS = _normalize.NORMALIZE_STRING_STRIP_ACCENTS
NORMALIZE_STRING_DECOMPOSE = _normalize.NORMALIZE_STRING_DECOMPOSE
NORMALIZE_STRING_LOWERCASE = _normalize.NORMALIZE_STRING_LOWERCASE
NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM
NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS
NORMALIZE_STRING_SIMPLE_LATIN_ASCII = _normalize.NORMALIZE_STRING_SIMPLE_LATIN_ASCII
DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS
# Token options
NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS
NORMALIZE_TOKEN_DELETE_HYPHENS = _normalize.NORMALIZE_TOKEN_DELETE_HYPHENS
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD = _normalize.NORMALIZE_TOKEN_DELETE_FINAL_PERIOD
NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS = _normalize.NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS
NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES = _normalize.NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES
NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE
NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC
NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS
DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS
TOKEN_OPTIONS_DROP_PERIODS = _normalize.NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS
DEFAULT_TOKEN_OPTIONS_NUMERIC = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC
def remove_parens(tokens):
    new_tokens = []
    open_parens = 0
    for t, c in tokens:
        if c == token_types.PUNCT_OPEN:
            open_parens += 1
        elif c == token_types.PUNCT_CLOSE:
            if open_parens > 0:
                open_parens -= 1
        elif open_parens <= 0:
            new_tokens.append((t, c))
    return new_tokens


def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
    s = safe_decode(s)
    return _normalize.normalize_string(s, string_options)


def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True, whitespace=False):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations,
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)
    normalized_tokens = _normalize.normalized_tokens(s, string_options, token_options, whitespace)

    if strip_parentheticals:
        normalized_tokens = remove_parens(normalized_tokens)

    return [(s, token_types.from_id(token_type)) for s, token_type in normalized_tokens]
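
A quick usage sketch of the wrapper above, assuming the module is importable as geodata.text.normalize (as the package's own imports suggest); the printed values are illustrative, since actual output depends on the installed libpostal data:

    from geodata.text.normalize import normalize_string, normalized_tokens

    # Deterministic string normalization with the default options.
    print(normalize_string(u'St.-Barthélemy'))

    # Tokenized, per-token normalized output as (token, token_type) pairs.
    # Parenthesized spans are dropped because strip_parentheticals defaults to True.
    for token, token_type in normalized_tokens(u'Thirty W 26th St (2nd Fl)'):
        print(token, token_type)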

View File

@@ -0,0 +1,75 @@
import six
from collections import deque
from marisa_trie import BytesTrie
from geodata.encoding import safe_encode, safe_decode
SENTINEL = None
class PhraseFilter(object):
    def __init__(self, phrases):
        if hasattr(phrases, 'items'):
            phrases = six.iteritems(phrases)
        vals = [(safe_decode(key), self.serialize(val)) for key, val in phrases]
        self.trie = BytesTrie(vals)

    serialize = staticmethod(safe_encode)
    deserialize = staticmethod(safe_decode)

    def filter(self, tokens):
        def return_item(item):
            return False, item, []

        if not tokens:
            return

        ent = []
        ent_tokens = []

        queue = deque(tokens + [(SENTINEL,) * 2])
        skip_until = 0
        trie = self.trie

        while queue:
            item = queue.popleft()
            t, c = item

            if t is not SENTINEL and trie.has_keys_with_prefix(u' '.join(ent_tokens + [t])):
                ent.append(item)
                ent_tokens.append(item[0])
            elif ent_tokens:
                res = trie.get(u' '.join(ent_tokens)) or None
                if res is not None:
                    yield (True, ent, map(self.deserialize, res))
                    queue.appendleft(item)
                    ent = []
                    ent_tokens = []
                elif len(ent_tokens) == 1:
                    yield return_item(ent[0])
                    ent = []
                    ent_tokens = []
                    queue.appendleft(item)
                else:
                    have_phrase = False
                    for i in six.moves.range(len(ent) - 1, 0, -1):
                        remainder = ent[i:]
                        res = trie.get(u' '.join([e[0] for e in ent[:i]])) or None
                        if res is not None:
                            yield (True, ent[:i], map(self.deserialize, res))
                            have_phrase = True
                            break

                    if not have_phrase:
                        yield return_item(ent[0])
                        # no prefix matched, so everything after the first token is re-queued
                        remainder = ent[1:]

                    todos = list(remainder)
                    todos.append(item)
                    queue.extendleft(reversed(todos))

                    ent = []
                    ent_tokens = []
            elif t is not SENTINEL:
                yield return_item(item)
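
A sketch of how PhraseFilter might be driven with the normalized tokens from the previous file. The phrase keys are space-joined token strings; the keys and values below are made-up examples, and the module path geodata.text.phrases is an assumption. filter() yields (True, matched_tokens, values) for a phrase and (False, (token, token_type), []) for a lone token:

    from geodata.text.normalize import normalized_tokens
    from geodata.text.phrases import PhraseFilter  # assumed module path

    phrases = {u'new york': 'city', u'new york city': 'city'}
    phrase_filter = PhraseFilter(phrases)

    tokens = normalized_tokens(u'New York City Hall')
    for is_phrase, t, values in phrase_filter.filter(tokens):
        if is_phrase:
            print('phrase:', [token for token, token_type in t], list(values))
        else:
            print('token:', t[0])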

View File

@@ -0,0 +1,291 @@
#include <Python.h>
#include <libpostal/libpostal.h>
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
struct module_state {
PyObject *error;
};
#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
static PyObject *py_normalize_string(PyObject *self, PyObject *args)
{
PyObject *arg1;
uint64_t options;
if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) {
return 0;
}
PyObject *unistr = PyUnicode_FromObject(arg1);
if (unistr == NULL) {
PyErr_SetString(PyExc_TypeError,
"Parameter could not be converted to unicode in scanner");
return 0;
}
#ifdef IS_PY3K
// Python 3 encoding, supported by Python 3.3+
char *input = PyUnicode_AsUTF8(unistr);
#else
// Python 2 encoding
PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
if (str == NULL) {
PyErr_SetString(PyExc_TypeError,
"Parameter could not be utf-8 encoded");
goto exit_normalize_decref_unistr;
}
char *input = PyBytes_AsString(str);
#endif
if (input == NULL) {
goto exit_normalize_decref_str;
}
char *normalized = libpostal_normalize_string(input, options);
if (normalized == NULL) {
goto exit_normalize_decref_str;
}
PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict");
free(normalized);
if (result == NULL) {
PyErr_SetString(PyExc_ValueError,
"Result could not be utf-8 decoded");
goto exit_normalize_decref_str;
}
#ifndef IS_PY3K
Py_XDECREF(str);
#endif
Py_XDECREF(unistr);
return result;
exit_normalize_decref_str:
#ifndef IS_PY3K
Py_XDECREF(str);
#endif
exit_normalize_decref_unistr:
Py_XDECREF(unistr);
return 0;
}
static PyObject *py_normalized_tokens(PyObject *self, PyObject *args)
{
PyObject *arg1;
uint64_t string_options = LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS;
uint64_t token_options = LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS;
uint32_t arg_whitespace = 0;
PyObject *result = NULL;
if (!PyArg_ParseTuple(args, "O|KKI:normalize", &arg1, &string_options, &token_options, &arg_whitespace)) {
return 0;
}
PyObject *unistr = PyUnicode_FromObject(arg1);
if (unistr == NULL) {
PyErr_SetString(PyExc_TypeError,
"Parameter could not be converted to unicode in scanner");
return 0;
}
#ifdef IS_PY3K
// Python 3 encoding, supported by Python 3.3+
char *input = PyUnicode_AsUTF8(unistr);
#else
// Python 2 encoding
PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
if (str == NULL) {
PyErr_SetString(PyExc_TypeError,
"Parameter could not be utf-8 encoded");
goto exit_normalized_tokens_decref_str;
}
char *input = PyBytes_AsString(str);
#endif
if (input == NULL) {
goto exit_normalized_tokens_decref_str;
}
bool whitespace = arg_whitespace;
size_t num_tokens;
libpostal_normalized_token_t *normalized_tokens = libpostal_normalized_tokens(input, string_options, token_options, whitespace, &num_tokens);
if (normalized_tokens == NULL) {
goto exit_normalized_tokens_decref_str;
}
result = PyList_New((Py_ssize_t)num_tokens);
if (!result) {
goto exit_free_normalized_tokens;
}
for (size_t i = 0; i < num_tokens; i++) {
libpostal_normalized_token_t normalized_token = normalized_tokens[i];
char *token_str = normalized_token.str;
PyObject *py_token = PyUnicode_DecodeUTF8((const char *)token_str, strlen(token_str), "strict");
if (py_token == NULL) {
Py_DECREF(result);
goto exit_free_normalized_tokens;
}
PyObject *t = PyTuple_New(2);
        PyObject *py_token_type = PyLong_FromLong((long)normalized_token.token.type);
PyTuple_SetItem(t, 0, py_token);
PyTuple_SetItem(t, 1, py_token_type);
// Note: PyList_SetItem steals a reference, so don't worry about DECREF
PyList_SetItem(result, (Py_ssize_t)i, t);
}
for (size_t i = 0; i < num_tokens; i++) {
free(normalized_tokens[i].str);
}
free(normalized_tokens);
#ifndef IS_PY3K
Py_XDECREF(str);
#endif
Py_XDECREF(unistr);
return result;
exit_free_normalized_tokens:
for (size_t i = 0; i < num_tokens; i++) {
free(normalized_tokens[i].str);
}
free(normalized_tokens);
exit_normalized_tokens_decref_str:
#ifndef IS_PY3K
Py_XDECREF(str);
#endif
exit_normalized_tokens_decref_unistr:
Py_XDECREF(unistr);
return 0;
}
static PyMethodDef normalize_methods[] = {
{"normalize_string", (PyCFunction)py_normalize_string, METH_VARARGS, "normalize_string(input, options)"},
{"normalized_tokens", (PyCFunction)py_normalized_tokens, METH_VARARGS, "normalize_token(input, string_options, token_options, whitespace)"},
{NULL, NULL},
};
#ifdef IS_PY3K
static int normalize_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int normalize_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_normalize",
NULL,
sizeof(struct module_state),
normalize_methods,
NULL,
normalize_traverse,
normalize_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit__normalize(void) {
#else
#define INITERROR return
void
init_normalize(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_normalize", normalize_methods);
#endif
if (module == NULL)
INITERROR;
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_normalize.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
if (!libpostal_setup()) {
PyErr_SetString(PyExc_RuntimeError,
"Could not load libpostal");
Py_DECREF(module);
INITERROR;
}
PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII));
PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE));
PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS));
PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE));
PyModule_AddObject(module, "NORMALIZE_STRING_COMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_COMPOSE));
PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LOWERCASE));
PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRIM));
PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS));
PyModule_AddObject(module, "NORMALIZE_STRING_SIMPLE_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII));
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES));
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE));
PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC));
PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS));
PyModule_AddObject(module, "NORMALIZE_DEFAULT_STRING_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS));
PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS));
PyModule_AddObject(module, "NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS));
PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC));
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}
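
The extension above compiles against libpostal's C library. A minimal, hypothetical setuptools sketch for building it follows; the source filename, package layout, and install prefix are assumptions, not taken from this commit:

    from setuptools import setup, Extension

    setup(
        name='geodata',
        ext_modules=[
            Extension('geodata.text._normalize',
                      sources=['geodata/text/pynormalize.c'],
                      libraries=['postal'],
                      include_dirs=['/usr/local/include'],
                      library_dirs=['/usr/local/lib']),
        ],
    )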

View File

@@ -0,0 +1,167 @@
#include <Python.h>
#include <libpostal/libpostal.h>
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
struct module_state {
PyObject *error;
};
#ifdef IS_PY3K
#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m))
#else
#define GETSTATE(m) (&_state)
static struct module_state _state;
#endif
static PyObject *py_tokenize(PyObject *self, PyObject *args)
{
PyObject *arg1;
uint32_t arg_whitespace = 0;
if (!PyArg_ParseTuple(args, "OI:tokenize", &arg1, &arg_whitespace)) {
return 0;
}
bool whitespace = arg_whitespace;
PyObject *unistr = PyUnicode_FromObject(arg1);
if (unistr == NULL) {
PyErr_SetString(PyExc_TypeError,
"Parameter could not be converted to unicode in scanner");
return 0;
}
#ifdef IS_PY3K
// Python 3 encoding, supported by Python 3.3+
char *input = PyUnicode_AsUTF8(unistr);
#else
// Python 2 encoding
PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict");
if (str == NULL) {
PyErr_SetString(PyExc_TypeError,
"Parameter could not be utf-8 encoded");
goto error_decref_unistr;
}
char *input = PyBytes_AsString(str);
#endif
if (input == NULL) {
goto error_decref_str;
}
size_t num_tokens;
libpostal_token_t *tokens = libpostal_tokenize(input, whitespace, &num_tokens);
if (tokens == NULL) {
goto error_decref_str;
}
PyObject *result = PyTuple_New(num_tokens);
if (!result) {
free(tokens);
goto error_decref_str;
}
PyObject *tuple;
libpostal_token_t token;
for (size_t i = 0; i < num_tokens; i++) {
token = tokens[i];
        tuple = Py_BuildValue("III", token.offset, token.len, token.type);
        if (tuple == NULL || PyTuple_SetItem(result, (Py_ssize_t)i, tuple) < 0) {
            free(tokens);
            Py_DECREF(result);
            goto error_decref_str;
        }
}
#ifndef IS_PY3K
Py_XDECREF(str);
#endif
Py_XDECREF(unistr);
free(tokens);
return result;
error_decref_str:
#ifndef IS_PY3K
Py_XDECREF(str);
#endif
error_decref_unistr:
Py_XDECREF(unistr);
return 0;
}
static PyMethodDef tokenize_methods[] = {
{"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text, whitespace)"},
{NULL, NULL},
};
#ifdef IS_PY3K
static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) {
Py_VISIT(GETSTATE(m)->error);
return 0;
}
static int tokenize_clear(PyObject *m) {
Py_CLEAR(GETSTATE(m)->error);
return 0;
}
static struct PyModuleDef module_def = {
PyModuleDef_HEAD_INIT,
"_tokenize",
NULL,
sizeof(struct module_state),
tokenize_methods,
NULL,
tokenize_traverse,
tokenize_clear,
NULL
};
#define INITERROR return NULL
PyObject *
PyInit__tokenize(void) {
#else
#define INITERROR return
void
init_tokenize(void) {
#endif
#ifdef IS_PY3K
PyObject *module = PyModule_Create(&module_def);
#else
PyObject *module = Py_InitModule("_tokenize", tokenize_methods);
#endif
if (module == NULL)
INITERROR;
struct module_state *st = GETSTATE(module);
st->error = PyErr_NewException("_tokenize.Error", NULL, NULL);
if (st->error == NULL) {
Py_DECREF(module);
INITERROR;
}
#if PY_MAJOR_VERSION >= 3
return module;
#endif
}
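
At the C level the tokenizer returns raw (offset, length, token_type) triples rather than strings; the offsets and lengths are byte positions in the UTF-8 encoding of the input, which is why the pure-Python tokenize() wrapper later in this commit slices the encoded bytes. A small sketch (the token type ids come from libpostal, so the printed values are illustrative):

    from geodata.text import _tokenize
    from geodata.encoding import safe_encode

    s = u'Quincy St.'
    utf8 = safe_encode(s)
    for offset, length, token_type in _tokenize.tokenize(s, 0):
        print(utf8[offset:offset + length], token_type)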

View File

@@ -0,0 +1,104 @@
from geodata.enum import Enum, EnumValue
class token_types(Enum):
    # Word types
    WORD = EnumValue(1)
    ABBREVIATION = EnumValue(2)
    IDEOGRAPHIC_CHAR = EnumValue(3)
    HANGUL_SYLLABLE = EnumValue(4)
    ACRONYM = EnumValue(5)

    # Special tokens
    EMAIL = EnumValue(20)
    URL = EnumValue(21)
    US_PHONE = EnumValue(22)
    INTL_PHONE = EnumValue(23)

    # Numbers and numeric types
    NUMERIC = EnumValue(50)
    ORDINAL = EnumValue(51)
    ROMAN_NUMERAL = EnumValue(52)
    IDEOGRAPHIC_NUMBER = EnumValue(53)

    # Punctuation types, may separate a phrase
    PERIOD = EnumValue(100)
    EXCLAMATION = EnumValue(101)
    QUESTION_MARK = EnumValue(102)
    COMMA = EnumValue(103)
    COLON = EnumValue(104)
    SEMICOLON = EnumValue(105)
    PLUS = EnumValue(106)
    AMPERSAND = EnumValue(107)
    AT_SIGN = EnumValue(108)
    POUND = EnumValue(109)
    ELLIPSIS = EnumValue(110)
    DASH = EnumValue(111)
    BREAKING_DASH = EnumValue(112)
    HYPHEN = EnumValue(113)
    PUNCT_OPEN = EnumValue(114)
    PUNCT_CLOSE = EnumValue(115)
    DOUBLE_QUOTE = EnumValue(119)
    SINGLE_QUOTE = EnumValue(120)
    OPEN_QUOTE = EnumValue(121)
    CLOSE_QUOTE = EnumValue(122)
    SLASH = EnumValue(124)
    BACKSLASH = EnumValue(125)
    GREATER_THAN = EnumValue(126)
    LESS_THAN = EnumValue(127)

    # Non-letters and whitespace
    OTHER = EnumValue(200)
    WHITESPACE = EnumValue(300)
    NEWLINE = EnumValue(301)

    # Phrase, special application-level type not returned by the tokenizer
    PHRASE = EnumValue(999)

    WORD_TOKEN_TYPES = set([
        WORD,
        ABBREVIATION,
        IDEOGRAPHIC_CHAR,
        HANGUL_SYLLABLE,
        ACRONYM
    ])

    NUMERIC_TOKEN_TYPES = set([
        NUMERIC,
        ORDINAL,
        ROMAN_NUMERAL,
        IDEOGRAPHIC_NUMBER,
    ])

    PUNCTUATION_TOKEN_TYPES = set([
        PERIOD,
        EXCLAMATION,
        QUESTION_MARK,
        COMMA,
        COLON,
        SEMICOLON,
        PLUS,
        AMPERSAND,
        AT_SIGN,
        POUND,
        ELLIPSIS,
        DASH,
        BREAKING_DASH,
        HYPHEN,
        PUNCT_OPEN,
        PUNCT_CLOSE,
        DOUBLE_QUOTE,
        SINGLE_QUOTE,
        OPEN_QUOTE,
        CLOSE_QUOTE,
        SLASH,
        BACKSLASH,
        GREATER_THAN,
        LESS_THAN,
    ])

    NON_ALPHANUMERIC_TOKEN_TYPES = PUNCTUATION_TOKEN_TYPES | set([
        OTHER,
        WHITESPACE,
        NEWLINE,
    ])
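
A small sketch of how the enum is used by the wrappers in this package: from_id() maps the integer type returned by the C extensions back to its EnumValue, which can then be compared against the constants or tested against the sets above:

    from geodata.text.token_types import token_types

    t = token_types.from_id(2)
    print(t == token_types.ABBREVIATION)
    print(t in token_types.WORD_TOKEN_TYPES)
    print(token_types.COMMA in token_types.PUNCTUATION_TOKEN_TYPES)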

View File

@@ -0,0 +1,11 @@
from geodata.encoding import safe_encode, safe_decode
from geodata.text import _tokenize
from geodata.text.token_types import token_types
def tokenize(s, whitespace=False):
    u = safe_decode(s)
    s = safe_encode(s)
    # libpostal reports each token as a (byte offset, byte length, token type) triple
    # over the UTF-8 encoding, so slice the encoded bytes before decoding each token.
    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u, whitespace)]
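
A usage sketch with a multi-byte character; the exact token types assigned depend on libpostal, so the output is only indicative:

    from geodata.text.tokenize import tokenize

    for token, token_type in tokenize(u'Plaça de Catalunya 5'):
        print(token, token_type)
    # 'Plaça' is longer in bytes than in characters, which is why the wrapper
    # slices the UTF-8 encoded string by byte offset before decoding each token.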

View File

@@ -0,0 +1,16 @@
import re
from geodata.text.tokenize import tokenize
from geodata.text.token_types import token_types
non_breaking_dash_regex = re.compile(u'[-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]', re.UNICODE)


def is_numeric(s):
    tokens = tokenize(s)
    return sum((1 for t, c in tokens if c in token_types.NUMERIC_TOKEN_TYPES)) == len(tokens)


def is_numeric_strict(s):
    tokens = tokenize(s)
    return sum((1 for t, c in tokens if c == token_types.NUMERIC)) == len(tokens)