From 448ca6a61a1b0b5e2a8f539f3d4d0e475928a49a Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 14 Aug 2017 04:04:58 -0600 Subject: [PATCH 01/89] [merge] merging commit from v1.1 --- scripts/geodata/text/normalize.py | 68 ++-------- scripts/geodata/text/pynormalize.c | 200 ++++++++++++----------------- scripts/geodata/text/pytokenize.c | 31 +++-- scripts/geodata/text/tokenize.py | 7 +- scripts/setup.py | 44 +++---- src/libpostal.c | 70 ++++++++++ src/libpostal.h | 110 ++++++++++++++++ src/normalize.h | 38 +++--- src/token_types.h | 94 +++++++------- src/tokens.h | 6 +- 10 files changed, 374 insertions(+), 294 deletions(-) diff --git a/scripts/geodata/text/normalize.py b/scripts/geodata/text/normalize.py index 87df1227..70a70be0 100644 --- a/scripts/geodata/text/normalize.py +++ b/scripts/geodata/text/normalize.py @@ -2,7 +2,6 @@ import six from geodata.text import _normalize -from geodata.text.tokenize import tokenize_raw from geodata.text.token_types import token_types from geodata.encoding import safe_decode @@ -17,12 +16,7 @@ NORMALIZE_STRING_TRIM = _normalize.NORMALIZE_STRING_TRIM NORMALIZE_STRING_REPLACE_HYPHENS = _normalize.NORMALIZE_STRING_REPLACE_HYPHENS NORMALIZE_STRING_SIMPLE_LATIN_ASCII = _normalize.NORMALIZE_STRING_SIMPLE_LATIN_ASCII -DEFAULT_STRING_OPTIONS = NORMALIZE_STRING_LATIN_ASCII | \ - NORMALIZE_STRING_DECOMPOSE | \ - NORMALIZE_STRING_TRIM | \ - NORMALIZE_STRING_REPLACE_HYPHENS | \ - NORMALIZE_STRING_STRIP_ACCENTS | \ - NORMALIZE_STRING_LOWERCASE +DEFAULT_STRING_OPTIONS = _normalize.NORMALIZE_DEFAULT_STRING_OPTIONS # Token options NORMALIZE_TOKEN_REPLACE_HYPHENS = _normalize.NORMALIZE_TOKEN_REPLACE_HYPHENS @@ -34,16 +28,10 @@ NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE = _normalize.NORMALIZE_TOKEN_DELETE_OTHE NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC = _normalize.NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC NORMALIZE_TOKEN_REPLACE_DIGITS = _normalize.NORMALIZE_TOKEN_REPLACE_DIGITS -DEFAULT_TOKEN_OPTIONS = NORMALIZE_TOKEN_REPLACE_HYPHENS | \ - 
NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ - NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | \ - NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | \ - NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE +DEFAULT_TOKEN_OPTIONS = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS -TOKEN_OPTIONS_DROP_PERIODS = NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | \ - NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS - -DEFAULT_TOKEN_OPTIONS_NUMERIC = (DEFAULT_TOKEN_OPTIONS | NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) +TOKEN_OPTIONS_DROP_PERIODS = _normalize.NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS +DEFAULT_TOKEN_OPTIONS_NUMERIC = _normalize.NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC def remove_parens(tokens): @@ -62,33 +50,7 @@ def remove_parens(tokens): def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS): s = safe_decode(s) - if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII: - normalized = _normalize.normalize_string_latin(s, string_options) - else: - normalized = _normalize.normalize_string_utf8(s, string_options) - - return normalized - - -def normalize_token(s, t, token_options=DEFAULT_TOKEN_OPTIONS): - return _normalize.normalize_token(s, t, token_options) - - -def normalize_tokens_whitespace(s, raw_tokens, token_options=DEFAULT_TOKEN_OPTIONS): - last_end = 0 - tokens = [] - - for t in raw_tokens: - t_norm = _normalize.normalize_token(s, t, token_options) - t_class = token_types.from_id(t[-1]) - - if last_end < t[0]: - tokens.append((six.u(' '), token_types.WHITESPACE)) - last_end = sum(t[:2]) - - tokens.append((t_norm, t_class)) - - return tokens + return _normalize.normalize_string(s, string_options) def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, @@ -105,20 +67,10 @@ def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS, Usage: normalized_tokens(u'St.-Barthélemy') ''' - normalized = normalize_string(s, string_options=string_options) - - # Tuples of (offset, len, type) - raw_tokens = tokenize_raw(normalized) - tokens = [] - last_end = 0 - - if not whitespace: - tokens = 
[(_normalize.normalize_token(normalized, t, token_options), - token_types.from_id(t[-1])) for t in raw_tokens] - else: - tokens = normalize_tokens_whitespace(normalized, raw_tokens, token_options=token_options) + s = safe_decode(s) + normalized_tokens = _normalize.normalized_tokens(s, string_options, token_options, whitespace) if strip_parentheticals: - return remove_parens(tokens) - else: - return tokens + normalized_tokens = remove_parens(normalized_tokens) + + return [(s, token_types.from_id(token_type)) for s, token_type in normalized_tokens] diff --git a/scripts/geodata/text/pynormalize.c b/scripts/geodata/text/pynormalize.c index 12f3735b..1ce2df7e 100644 --- a/scripts/geodata/text/pynormalize.c +++ b/scripts/geodata/text/pynormalize.c @@ -1,7 +1,6 @@ #include -#include "src/normalize.h" -#include "src/transliterate.h" +#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K @@ -19,9 +18,7 @@ struct module_state { static struct module_state _state; #endif - - -static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) +static PyObject *py_normalize_string(PyObject *self, PyObject *args) { PyObject *arg1; uint64_t options; @@ -48,7 +45,7 @@ static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) if (str == NULL) { PyErr_SetString(PyExc_TypeError, "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; + goto exit_normalize_decref_unistr; } char *input = PyBytes_AsString(str); @@ -56,13 +53,13 @@ static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) #endif if (input == NULL) { - goto exit_decref_str; + goto exit_normalize_decref_str; } - char *normalized = normalize_string_utf8(input, options); + char *normalized = libpostal_normalize_string(input, options); if (normalized == NULL) { - goto exit_decref_str; + goto exit_normalize_decref_str; } PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict"); @@ -70,7 +67,7 @@ static PyObject 
*py_normalize_string_utf8(PyObject *self, PyObject *args) if (result == NULL) { PyErr_SetString(PyExc_ValueError, "Result could not be utf-8 decoded"); - goto exit_decref_str; + goto exit_normalize_decref_str; } #ifndef IS_PY3K @@ -80,21 +77,26 @@ static PyObject *py_normalize_string_utf8(PyObject *self, PyObject *args) return result; -exit_decref_str: +exit_normalize_decref_str: #ifndef IS_PY3K Py_XDECREF(str); #endif -exit_decref_unistr: +exit_normalize_decref_unistr: Py_XDECREF(unistr); return 0; } -static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) +static PyObject *py_normalized_tokens(PyObject *self, PyObject *args) { PyObject *arg1; - uint64_t options; - if (!PyArg_ParseTuple(args, "OK:normalize", &arg1, &options)) { + uint64_t string_options = LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS; + uint64_t token_options = LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS; + uint32_t arg_whitespace = 0; + + PyObject *result = NULL; + + if (!PyArg_ParseTuple(args, "O|KKI:normalize", &arg1, &string_options, &token_options, &arg_whitespace)) { return 0; } @@ -117,7 +119,7 @@ static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) if (str == NULL) { PyErr_SetString(PyExc_TypeError, "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; + goto exit_normalized_tokens_decref_str; } char *input = PyBytes_AsString(str); @@ -125,98 +127,46 @@ static PyObject *py_normalize_string_latin(PyObject *self, PyObject *args) #endif if (input == NULL) { - goto exit_decref_str; + goto exit_normalized_tokens_decref_str; } - char *normalized = normalize_string_latin(input, strlen(input), options); + bool whitespace = arg_whitespace; - PyObject *result = PyUnicode_DecodeUTF8((const char *)normalized, strlen(normalized), "strict"); - free(normalized); - if (result == NULL) { - PyErr_SetString(PyExc_ValueError, - "Result could not be utf-8 decoded"); - goto exit_decref_str; + size_t num_tokens; + libpostal_normalized_token_t *normalized_tokens = 
libpostal_normalized_tokens(input, string_options, token_options, whitespace, &num_tokens); + + if (normalized_tokens == NULL) { + goto exit_normalized_tokens_decref_str; } - #ifndef IS_PY3K - Py_XDECREF(str); - #endif - Py_XDECREF(unistr); - - return result; - -exit_decref_str: -#ifndef IS_PY3K - Py_XDECREF(str); -#endif -exit_decref_unistr: - Py_XDECREF(unistr); - return 0; -} - - - -static PyObject *py_normalize_token(PyObject *self, PyObject *args) -{ - PyObject *s; - - uint32_t offset; - uint32_t len; - uint16_t type; - - uint64_t options; - if (!PyArg_ParseTuple(args, "O(IIH)K:normalize", &s, &offset, &len, &type, &options)) { - PyErr_SetString(PyExc_TypeError, - "Error parsing arguments"); - return 0; + result = PyList_New((Py_ssize_t)num_tokens); + if (!result) { + goto exit_free_normalized_tokens; } - token_t token = (token_t){(size_t)offset, (size_t)len, type}; - - PyObject *unistr = PyUnicode_FromObject(s); - if (unistr == NULL) { - PyErr_SetString(PyExc_TypeError, - "Parameter could not be converted to unicode in scanner"); - return 0; - } - - #ifdef IS_PY3K - // Python 3 encoding, supported by Python 3.3+ - - char *input = PyUnicode_AsUTF8(unistr); - - #else - // Python 2 encoding - - PyObject *str = PyUnicode_AsEncodedString(unistr, "utf-8", "strict"); - if (str == NULL) { - PyErr_SetString(PyExc_ValueError, - "Parameter could not be utf-8 encoded"); - goto exit_decref_unistr; + for (size_t i = 0; i < num_tokens; i++) { + libpostal_normalized_token_t normalized_token = normalized_tokens[i]; + char *token_str = normalized_token.str; + PyObject *py_token = PyUnicode_DecodeUTF8((const char *)token_str, strlen(token_str), "strict"); + if (py_token == NULL) { + Py_DECREF(result); + goto exit_free_normalized_tokens; } - char *input = PyBytes_AsString(str); + PyObject *t = PyTuple_New(2); + PyObject *py_token_type = PyInt_FromLong(normalized_token.token.type); - #endif + PyTuple_SetItem(t, 0, py_token); + PyTuple_SetItem(t, 1, py_token_type); - if (input == 
NULL) { - goto exit_decref_str; + // Note: PyList_SetItem steals a reference, so don't worry about DECREF + PyList_SetItem(result, (Py_ssize_t)i, t); } - char_array *token_buffer = char_array_new_size(token.len); - - add_normalized_token(token_buffer, input, token, options); - char *token_str = char_array_get_string(token_buffer); - PyObject *result = PyUnicode_DecodeUTF8((const char *)token_str, token_buffer->n - 1, "strict"); - - if (result == NULL) { - PyErr_SetString(PyExc_ValueError, - "Error decoding token"); - char_array_destroy(token_buffer); - goto exit_decref_str; + for (size_t i = 0; i < num_tokens; i++) { + free(normalized_tokens[i].str); } - - char_array_destroy(token_buffer); + free(normalized_tokens); #ifndef IS_PY3K Py_XDECREF(str); @@ -224,20 +174,24 @@ static PyObject *py_normalize_token(PyObject *self, PyObject *args) Py_XDECREF(unistr); return result; - -exit_decref_str: +exit_free_normalized_tokens: + for (size_t i = 0; i < num_tokens; i++) { + free(normalized_tokens[i].str); + } + free(normalized_tokens); +exit_normalized_tokens_decref_str: #ifndef IS_PY3K Py_XDECREF(str); #endif -exit_decref_unistr: +exit_normalized_tokens_decref_unistr: Py_XDECREF(unistr); return 0; } + static PyMethodDef normalize_methods[] = { - {"normalize_string_utf8", (PyCFunction)py_normalize_string_utf8, METH_VARARGS, "normalize_string_utf8(input, options)"}, - {"normalize_string_latin", (PyCFunction)py_normalize_string_latin, METH_VARARGS, "normalize_string_latin(input, options)"}, - {"normalize_token", (PyCFunction)py_normalize_token, METH_VARARGS, "normalize_token(input, options)"}, + {"normalize_string", (PyCFunction)py_normalize_string, METH_VARARGS, "normalize_string(input, options)"}, + {"normalized_tokens", (PyCFunction)py_normalized_tokens, METH_VARARGS, "normalize_token(input, string_options, token_options, whitespace)"}, {NULL, NULL}, }; @@ -295,32 +249,40 @@ init_normalize(void) { INITERROR; } - if (!transliteration_module_setup(NULL)) { + if 
(!libpostal_setup()) { PyErr_SetString(PyExc_RuntimeError, - "Could not load transliterate module"); + "Could not load libpostal"); Py_DECREF(module); INITERROR; } - PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LATIN_ASCII)); - PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRANSLITERATE)); - PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_STRIP_ACCENTS)); - PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_DECOMPOSE)); - PyModule_AddObject(module, "NORMALIZE_STRING_COMPOSE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_COMPOSE)); - PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_LOWERCASE)); - PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_TRIM)); - PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_REPLACE_HYPHENS)); - PyModule_AddObject(module, "NORMALIZE_STRING_SIMPLE_LATIN_ASCII", PyLong_FromUnsignedLongLong(NORMALIZE_STRING_SIMPLE_LATIN_ASCII)); + PyModule_AddObject(module, "NORMALIZE_STRING_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII)); + PyModule_AddObject(module, "NORMALIZE_STRING_TRANSLITERATE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE)); + PyModule_AddObject(module, "NORMALIZE_STRING_STRIP_ACCENTS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS)); + PyModule_AddObject(module, "NORMALIZE_STRING_DECOMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE)); + PyModule_AddObject(module, "NORMALIZE_STRING_COMPOSE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_COMPOSE)); + PyModule_AddObject(module, "NORMALIZE_STRING_LOWERCASE", 
PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_LOWERCASE)); + PyModule_AddObject(module, "NORMALIZE_STRING_TRIM", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_TRIM)); + PyModule_AddObject(module, "NORMALIZE_STRING_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_STRING_SIMPLE_LATIN_ASCII", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_HYPHENS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_HYPHENS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_FINAL_PERIOD)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)); - PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(NORMALIZE_TOKEN_REPLACE_DIGITS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_HYPHENS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_FINAL_PERIOD", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD)); + 
PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC)); + PyModule_AddObject(module, "NORMALIZE_TOKEN_REPLACE_DIGITS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS)); + + + PyModule_AddObject(module, "NORMALIZE_DEFAULT_STRING_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS)); + PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS)); + + PyModule_AddObject(module, "NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS)); + + PyModule_AddObject(module, "NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC", PyLong_FromUnsignedLongLong(LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC)); #if PY_MAJOR_VERSION >= 3 diff --git a/scripts/geodata/text/pytokenize.c b/scripts/geodata/text/pytokenize.c index 7986bae3..a69a86ea 100644 --- a/scripts/geodata/text/pytokenize.c +++ b/scripts/geodata/text/pytokenize.c @@ -1,6 +1,6 @@ #include -#include "src/scanner.h" +#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K @@ -18,14 +18,17 @@ struct module_state { static struct module_state _state; #endif - static PyObject *py_tokenize(PyObject *self, PyObject *args) { PyObject *arg1; - if (!PyArg_ParseTuple(args, "O:tokenize", &arg1)) { + uint32_t arg_whitespace = 0; + + if (!PyArg_ParseTuple(args, "OI:tokenize", &arg1, &arg_whitespace)) { return 0; } + bool 
whitespace = arg_whitespace; + PyObject *unistr = PyUnicode_FromObject(arg1); if (unistr == NULL) { PyErr_SetString(PyExc_TypeError, @@ -57,26 +60,28 @@ static PyObject *py_tokenize(PyObject *self, PyObject *args) goto error_decref_str; } - token_array *tokens = tokenize(input); + size_t num_tokens; + + libpostal_token_t *tokens = libpostal_tokenize(input, whitespace, &num_tokens); if (tokens == NULL) { goto error_decref_str; } - PyObject *result = PyTuple_New(tokens->n); + PyObject *result = PyTuple_New(num_tokens); if (!result) { - token_array_destroy(tokens); + free(tokens); goto error_decref_str; return 0; } PyObject *tuple; - token_t token; - for (size_t i = 0; i < tokens->n; i++) { - token = tokens->a[i]; + libpostal_token_t token; + for (size_t i = 0; i < num_tokens; i++) { + token = tokens[i]; tuple = Py_BuildValue("III", token.offset, token.len, token.type); if (PyTuple_SetItem(result, i, tuple) < 0) { - token_array_destroy(tokens); + free(tokens); goto error_decref_str; } } @@ -86,7 +91,7 @@ static PyObject *py_tokenize(PyObject *self, PyObject *args) #endif Py_XDECREF(unistr); - token_array_destroy(tokens); + free(tokens); return result; @@ -100,12 +105,10 @@ error_decref_unistr: } static PyMethodDef tokenize_methods[] = { - {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text)"}, + {"tokenize", (PyCFunction)py_tokenize, METH_VARARGS, "tokenize(text, whitespace)"}, {NULL, NULL}, }; - - #ifdef IS_PY3K static int tokenize_traverse(PyObject *m, visitproc visit, void *arg) { diff --git a/scripts/geodata/text/tokenize.py b/scripts/geodata/text/tokenize.py index d3d18832..a05022bc 100644 --- a/scripts/geodata/text/tokenize.py +++ b/scripts/geodata/text/tokenize.py @@ -3,12 +3,9 @@ from geodata.text import _tokenize from geodata.text.token_types import token_types -def tokenize_raw(s): - return _tokenize.tokenize(safe_decode(s)) - -def tokenize(s): +def tokenize(s, whitespace=False): u = safe_decode(s) s = safe_encode(s) return 
[(safe_decode(s[start:start + length]), token_types.from_id(token_type)) - for start, length, token_type in _tokenize.tokenize(u)] + for start, length, token_type in _tokenize.tokenize(u, whitespace)] diff --git a/scripts/setup.py b/scripts/setup.py index a25b6b26..6bbf8891 100644 --- a/scripts/setup.py +++ b/scripts/setup.py @@ -2,9 +2,7 @@ import os from setuptools import setup, Extension, find_packages -this_dir = os.path.dirname(__file__) -PROJECT_DIR = os.path.join(this_dir, os.pardir) -SRC_DIR = os.path.join(PROJECT_DIR, 'src') +RESOURCES_DIR = 'resources' def main(): @@ -14,35 +12,29 @@ def main(): packages=find_packages(), ext_modules=[ Extension('geodata.text._tokenize', - sources=[os.path.join(SRC_DIR, f) - for f in ('scanner.c', - 'string_utils.c', - 'tokens.c', - 'utf8proc/utf8proc.c', - ) - ] + ['geodata/text/pytokenize.c'], - include_dirs=[PROJECT_DIR], - extra_compile_args=['-O0', '-std=gnu99', + sources=['geodata/text/pytokenize.c'], + libraries=['postal'], + include_dirs=['/usr/local/include'], + library_dirs=['/usr/local/lib'], + extra_compile_args=['-std=c99', '-Wno-unused-function'], ), Extension('geodata.text._normalize', - sources=[os.path.join(SRC_DIR, f) - for f in ('normalize.c', - 'string_utils.c', - 'utf8proc/utf8proc.c', - 'tokens.c', - 'unicode_scripts.c', - 'transliterate.c', - 'file_utils.c', - 'trie.c', - 'trie_search.c',) - ] + ['geodata/text/pynormalize.c'], - include_dirs=[PROJECT_DIR], - extra_compile_args=['-std=gnu99', '-DHAVE_CONFIG_H', - '-DLIBPOSTAL_DATA_DIR="{}"'.format(os.getenv('LIBPOSTAL_DATA_DIR', os.path.realpath(os.path.join(PROJECT_DIR, 'data')))), + sources=['geodata/text/pynormalize.c'], + libraries=['postal'], + include_dirs=['/usr/local/include'], + library_dirs=['/usr/local/lib'], + extra_compile_args=['-std=c99', '-Wno-unused-function'], ), ], + data_files=[ + (os.path.join(RESOURCES_DIR, os.path.relpath(d, RESOURCES_DIR)), [os.path.join(d, filename) for filename in filenames]) + for d, _, filenames in 
os.walk(RESOURCES_DIR) + ], + package_data={ + 'geodata': ['**/*.sh'] + }, include_package_data=True, zip_safe=False, url='http://mapzen.com', diff --git a/src/libpostal.c b/src/libpostal.c index d226413e..c969d86c 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -1137,6 +1137,76 @@ bool libpostal_setup_language_classifier_datadir(char *datadir) { return true; } + +libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n) { + token_array *tokens = NULL; + if (!whitespace) { + tokens = tokenize(input); + } else { + tokens = tokenize_keep_whitespace(input); + } + + if (tokens == NULL) { + return NULL; + } + + libpostal_token_t *a = tokens->a; + *n = tokens->n; + free(tokens); + return a; +} + +char *libpostal_normalize_string(char *str, uint64_t options) { + if (options & LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII) { + return normalize_string_latin(str, strlen(str), options); + } else { + return normalize_string_utf8(str, options); + } +} + +libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n) { + if (input == NULL) { + return NULL; + } + char *normalized = libpostal_normalize_string(input, string_options); + if (normalized == NULL) { + return NULL; + } + + token_array *tokens = NULL; + if (!whitespace) { + tokens = tokenize(normalized); + } else { + tokens = tokenize_keep_whitespace(normalized); + } + + if (tokens == NULL || tokens->a == NULL) { + free(normalized); + return NULL; + } + + size_t num_tokens = tokens->n; + token_t *token_array = tokens->a; + char_array *normalized_token = char_array_new_size(strlen(normalized)); + + libpostal_normalized_token_t *result = malloc(sizeof(libpostal_normalized_token_t) * num_tokens); + + for (size_t i = 0; i < num_tokens; i++) { + token_t token = token_array[i]; + char_array_clear(normalized_token); + add_normalized_token(normalized_token, normalized, token, token_options); + char *token_str = 
strdup(char_array_get_string(normalized_token)); + result[i] = (libpostal_normalized_token_t){token_str, token}; + } + + free(normalized); + token_array_destroy(tokens); + char_array_destroy(normalized_token); + + *n = num_tokens; + return result; +} + bool libpostal_setup_language_classifier(void) { return libpostal_setup_language_classifier_datadir(NULL); } diff --git a/src/libpostal.h b/src/libpostal.h index 3b86dea3..274c6391 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -12,6 +12,67 @@ extern "C" { #define LIBPOSTAL_MAX_LANGUAGE_LEN 4 +// Doing these as #defines so we can duplicate the values exactly in Python + + +typedef enum { + LIBPOSTAL_TOKEN_TYPE_END = 0, // Null byte + // Word types + LIBPOSTAL_TOKEN_TYPE_WORD = 1, // Any letter-only word (includes all unicode letters) + LIBPOSTAL_TOKEN_TYPE_ABBREVIATION = 2, // Loose abbreviations (roughly anything containing a "." as we don't care about sentences in addresses) + LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR = 3, // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character + LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE = 4, // Hangul syllable sequences which contain more than one codepoint + LIBPOSTAL_TOKEN_TYPE_ACRONYM = 5, // Specifically things like U.N. where we may delete internal periods + + LIBPOSTAL_TOKEN_TYPE_PHRASE = 10, // Not part of the first stage tokenizer, but may be used after phrase parsing + + // Special tokens + LIBPOSTAL_TOKEN_TYPE_EMAIL = 20, // Make sure emails are tokenized altogether + LIBPOSTAL_TOKEN_TYPE_URL = 21, // Make sure urls are tokenized altogether + LIBPOSTAL_TOKEN_TYPE_US_PHONE = 22, // US phone number (with or without country code) + LIBPOSTAL_TOKEN_TYPE_INTL_PHONE = 23, // A non-US phone number (must have country code) + + // Numbers and numeric types + LIBPOSTAL_TOKEN_TYPE_NUMERIC = 50, // Any sequence containing a digit + LIBPOSTAL_TOKEN_TYPE_ORDINAL = 51, // 1st, 2nd, 1er, 1 etc. 
+ LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL = 52, // II, III, VI, etc. + LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER = 53, // All numeric ideographic characters, includes e.g. Han numbers and chars like "²" + + // Punctuation types, may separate a phrase + LIBPOSTAL_TOKEN_TYPE_PERIOD = 100, + LIBPOSTAL_TOKEN_TYPE_EXCLAMATION = 101, + LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK = 102, + LIBPOSTAL_TOKEN_TYPE_COMMA = 103, + LIBPOSTAL_TOKEN_TYPE_COLON = 104, + LIBPOSTAL_TOKEN_TYPE_SEMICOLON = 105, + LIBPOSTAL_TOKEN_TYPE_PLUS = 106, + LIBPOSTAL_TOKEN_TYPE_AMPERSAND = 107, + LIBPOSTAL_TOKEN_TYPE_AT_SIGN = 108, + LIBPOSTAL_TOKEN_TYPE_POUND = 109, + LIBPOSTAL_TOKEN_TYPE_ELLIPSIS = 110, + LIBPOSTAL_TOKEN_TYPE_DASH = 111, + LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH = 112, + LIBPOSTAL_TOKEN_TYPE_HYPHEN = 113, + LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN = 114, + LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE = 115, + LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE = 119, + LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE = 120, + LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE = 121, + LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE = 122, + LIBPOSTAL_TOKEN_TYPE_SLASH = 124, + LIBPOSTAL_TOKEN_TYPE_BACKSLASH = 125, + LIBPOSTAL_TOKEN_TYPE_GREATER_THAN = 126, + LIBPOSTAL_TOKEN_TYPE_LESS_THAN = 127, + + // Non-letters and whitespace + LIBPOSTAL_TOKEN_TYPE_OTHER = 200, + LIBPOSTAL_TOKEN_TYPE_WHITESPACE = 300, + LIBPOSTAL_TOKEN_TYPE_NEWLINE = 301, + + LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR = 500 +} libpostal_token_type_t; + + /* Address dictionaries */ @@ -99,6 +160,55 @@ bool libpostal_setup_parser(void); bool libpostal_setup_parser_datadir(char *datadir); void libpostal_teardown_parser(void); +typedef struct libpostal_token { + size_t offset; + size_t len; + uint16_t type; +} libpostal_token_t; + +libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n); + +// Normalize string options +#define LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII 1 << 0 +#define LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE 1 << 1 +#define LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS 1 << 2 +#define 
LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE 1 << 3 +#define LIBPOSTAL_NORMALIZE_STRING_LOWERCASE 1 << 4 +#define LIBPOSTAL_NORMALIZE_STRING_TRIM 1 << 5 +#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 +#define LIBPOSTAL_NORMALIZE_STRING_COMPOSE 1 << 7 +#define LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 +#define LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 + +// Normalize token options +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3 +#define LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 +#define LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 +#define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS 1 << 8 + +#define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE) + +#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS (LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS | LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE) + +#define LIBPOSTAL_NORMALIZE_TOKEN_OPTIONS_DROP_PERIODS (LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS) + +#define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) + +char *libpostal_normalize_string(char *input, uint64_t options); + + +typedef struct libpostal_normalized_token 
{ + char *str; + libpostal_token_t token; +} libpostal_normalized_token_t; + +libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); + bool libpostal_setup_language_classifier(void); bool libpostal_setup_language_classifier_datadir(char *datadir); void libpostal_teardown_language_classifier(void); diff --git a/src/normalize.h b/src/normalize.h index d485f67f..755b7cee 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -30,6 +30,7 @@ As well as normalizations for individual string tokens: #include "constants.h" #include "klib/khash.h" +#include "libpostal.h" #include "string_utils.h" #include "utf8proc/utf8proc.h" #include "unicode_scripts.h" @@ -39,25 +40,26 @@ As well as normalizations for individual string tokens: #include "tokens.h" #include "vector.h" -#define NORMALIZE_STRING_LATIN_ASCII 1 << 0 -#define NORMALIZE_STRING_TRANSLITERATE 1 << 1 -#define NORMALIZE_STRING_STRIP_ACCENTS 1 << 2 -#define NORMALIZE_STRING_DECOMPOSE 1 << 3 -#define NORMALIZE_STRING_LOWERCASE 1 << 4 -#define NORMALIZE_STRING_TRIM 1 << 5 -#define NORMALIZE_STRING_REPLACE_HYPHENS 1 << 6 -#define NORMALIZE_STRING_COMPOSE 1 << 7 -#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 1 << 8 -#define NORMALIZE_STRING_REPLACE_NUMEX 1 << 9 +#define NORMALIZE_STRING_LATIN_ASCII LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII +#define NORMALIZE_STRING_TRANSLITERATE LIBPOSTAL_NORMALIZE_STRING_TRANSLITERATE +#define NORMALIZE_STRING_STRIP_ACCENTS LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS +#define NORMALIZE_STRING_DECOMPOSE LIBPOSTAL_NORMALIZE_STRING_DECOMPOSE +#define NORMALIZE_STRING_LOWERCASE LIBPOSTAL_NORMALIZE_STRING_LOWERCASE +#define NORMALIZE_STRING_TRIM LIBPOSTAL_NORMALIZE_STRING_TRIM +#define NORMALIZE_STRING_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS +#define NORMALIZE_STRING_COMPOSE LIBPOSTAL_NORMALIZE_STRING_COMPOSE +#define NORMALIZE_STRING_SIMPLE_LATIN_ASCII 
LIBPOSTAL_NORMALIZE_STRING_SIMPLE_LATIN_ASCII +#define NORMALIZE_STRING_REPLACE_NUMEX LIBPOSTAL_NORMALIZE_STRING_REPLACE_NUMEX -#define NORMALIZE_TOKEN_REPLACE_HYPHENS 1 << 0 -#define NORMALIZE_TOKEN_DELETE_HYPHENS 1 << 1 -#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD 1 << 2 -#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS 1 << 3 -#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES 1 << 4 -#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE 1 << 5 -#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 << 6 -#define NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7 +#define NORMALIZE_TOKEN_REPLACE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_HYPHENS +#define NORMALIZE_TOKEN_DELETE_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_HYPHENS +#define NORMALIZE_TOKEN_DELETE_FINAL_PERIOD LIBPOSTAL_NORMALIZE_TOKEN_DELETE_FINAL_PERIOD +#define NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS LIBPOSTAL_NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS +#define NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES LIBPOSTAL_NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES +#define NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE LIBPOSTAL_NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE +#define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC +#define NORMALIZE_TOKEN_REPLACE_DIGITS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS +#define NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS // Replace digits with capital D e.g. 10013 => DDDDD, intended for use with lowercased strings #define DIGIT_CHAR "D" diff --git a/src/token_types.h b/src/token_types.h index d746ae89..31cc2ba9 100644 --- a/src/token_types.h +++ b/src/token_types.h @@ -1,64 +1,60 @@ #ifndef TOKEN_TYPES_H #define TOKEN_TYPES_H +#include "libpostal.h" + // Doing these as #defines so we can duplicate the values exactly in Python -#define END 0 // Null byte -// Word types -#define WORD 1 // Any letter-only word (includes all unicode letters) -#define ABBREVIATION 2 // Loose abbreviations (roughly anything containing a "." 
as we don't care about sentences in addresses) -#define IDEOGRAPHIC_CHAR 3 // For languages that don't separate on whitespace (e.g. Chinese, Japanese, Korean), separate by character -#define HANGUL_SYLLABLE 4 // Hangul syllable sequences which contain more than one codepoint -#define ACRONYM 5 // Specifically things like U.N. where we may delete internal periods +#define END LIBPOSTAL_TOKEN_TYPE_END -#define PHRASE 10 // Not part of the first stage tokenizer, but may be used after phrase parsing +#define WORD LIBPOSTAL_TOKEN_TYPE_WORD +#define ABBREVIATION LIBPOSTAL_TOKEN_TYPE_ABBREVIATION +#define IDEOGRAPHIC_CHAR LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_CHAR +#define HANGUL_SYLLABLE LIBPOSTAL_TOKEN_TYPE_HANGUL_SYLLABLE +#define ACRONYM LIBPOSTAL_TOKEN_TYPE_ACRONYM +#define PHRASE LIBPOSTAL_TOKEN_TYPE_PHRASE -// Special tokens -#define EMAIL 20 // Make sure emails are tokenized altogether -#define URL 21 // Make sure urls are tokenized altogether -#define US_PHONE 22 // US phone number (with or without country code) -#define INTL_PHONE 23 // A non-US phone number (must have country code) +#define EMAIL LIBPOSTAL_TOKEN_TYPE_EMAIL +#define URL LIBPOSTAL_TOKEN_TYPE_URL +#define US_PHONE LIBPOSTAL_TOKEN_TYPE_US_PHONE +#define INTL_PHONE LIBPOSTAL_TOKEN_TYPE_INTL_PHONE -// Numbers and numeric types -#define NUMERIC 50 // Any sequence containing a digit -#define ORDINAL 51 // 1st, 2nd, 1er, 1 etc. -#define ROMAN_NUMERAL 52 // II, III, VI, etc. -#define IDEOGRAPHIC_NUMBER 53 // All numeric ideographic characters, includes e.g. 
Han numbers and chars like "²" +#define NUMERIC LIBPOSTAL_TOKEN_TYPE_NUMERIC +#define ORDINAL LIBPOSTAL_TOKEN_TYPE_ORDINAL +#define ROMAN_NUMERAL LIBPOSTAL_TOKEN_TYPE_ROMAN_NUMERAL +#define IDEOGRAPHIC_NUMBER LIBPOSTAL_TOKEN_TYPE_IDEOGRAPHIC_NUMBER +#define PERIOD LIBPOSTAL_TOKEN_TYPE_PERIOD +#define EXCLAMATION LIBPOSTAL_TOKEN_TYPE_EXCLAMATION +#define QUESTION_MARK LIBPOSTAL_TOKEN_TYPE_QUESTION_MARK +#define COMMA LIBPOSTAL_TOKEN_TYPE_COMMA +#define COLON LIBPOSTAL_TOKEN_TYPE_COLON +#define SEMICOLON LIBPOSTAL_TOKEN_TYPE_SEMICOLON +#define PLUS LIBPOSTAL_TOKEN_TYPE_PLUS +#define AMPERSAND LIBPOSTAL_TOKEN_TYPE_AMPERSAND +#define AT_SIGN LIBPOSTAL_TOKEN_TYPE_AT_SIGN +#define POUND LIBPOSTAL_TOKEN_TYPE_POUND +#define ELLIPSIS LIBPOSTAL_TOKEN_TYPE_ELLIPSIS +#define DASH LIBPOSTAL_TOKEN_TYPE_DASH +#define BREAKING_DASH LIBPOSTAL_TOKEN_TYPE_BREAKING_DASH +#define HYPHEN LIBPOSTAL_TOKEN_TYPE_HYPHEN +#define PUNCT_OPEN LIBPOSTAL_TOKEN_TYPE_PUNCT_OPEN +#define PUNCT_CLOSE LIBPOSTAL_TOKEN_TYPE_PUNCT_CLOSE +#define DOUBLE_QUOTE LIBPOSTAL_TOKEN_TYPE_DOUBLE_QUOTE +#define SINGLE_QUOTE LIBPOSTAL_TOKEN_TYPE_SINGLE_QUOTE +#define OPEN_QUOTE LIBPOSTAL_TOKEN_TYPE_OPEN_QUOTE +#define CLOSE_QUOTE LIBPOSTAL_TOKEN_TYPE_CLOSE_QUOTE +#define SLASH LIBPOSTAL_TOKEN_TYPE_SLASH +#define BACKSLASH LIBPOSTAL_TOKEN_TYPE_BACKSLASH +#define GREATER_THAN LIBPOSTAL_TOKEN_TYPE_GREATER_THAN +#define LESS_THAN LIBPOSTAL_TOKEN_TYPE_LESS_THAN -// Punctuation types, may separate a phrase -#define PERIOD 100 -#define EXCLAMATION 101 -#define QUESTION_MARK 102 -#define COMMA 103 -#define COLON 104 -#define SEMICOLON 105 -#define PLUS 106 -#define AMPERSAND 107 -#define AT_SIGN 108 -#define POUND 109 -#define ELLIPSIS 110 -#define DASH 111 -#define BREAKING_DASH 112 -#define HYPHEN 113 -#define PUNCT_OPEN 114 -#define PUNCT_CLOSE 115 -#define DOUBLE_QUOTE 119 -#define SINGLE_QUOTE 120 -#define OPEN_QUOTE 121 -#define CLOSE_QUOTE 122 -#define SLASH 124 -#define BACKSLASH 125 -#define GREATER_THAN 126 
-#define LESS_THAN 127 +#define OTHER LIBPOSTAL_TOKEN_TYPE_OTHER +#define WHITESPACE LIBPOSTAL_TOKEN_TYPE_WHITESPACE +#define NEWLINE LIBPOSTAL_TOKEN_TYPE_NEWLINE -// Non-letters and whitespace -#define OTHER 200 -#define WHITESPACE 300 -#define NEWLINE 301 - -#define INVALID_CHAR 500 +#define INVALID_CHAR LIBPOSTAL_TOKEN_TYPE_INVALID_CHAR #define is_word_token(type) ((type) == WORD || (type) == ABBREVIATION || (type) == ACRONYM || (type) == IDEOGRAPHIC_CHAR || (type) == HANGUL_SYLLABLE) diff --git a/src/tokens.h b/src/tokens.h index 6b314417..8823a628 100644 --- a/src/tokens.h +++ b/src/tokens.h @@ -12,11 +12,7 @@ #include "token_types.h" #include "vector.h" -typedef struct token { - size_t offset; - size_t len; - uint16_t type; -} token_t; +typedef libpostal_token_t token_t; VECTOR_INIT(token_array, token_t) From f8a808e25426f7c29c6ad8d9420be5e0a219b6d8 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 12 Oct 2017 11:16:53 -0400 Subject: [PATCH 02/89] [utils] adding utf8_len function for strings, and utf8_is_digit --- src/string_utils.c | 32 ++++++++++++++++++++++++++++++++ src/string_utils.h | 2 ++ 2 files changed, 34 insertions(+) diff --git a/src/string_utils.c b/src/string_utils.c index b337de47..b2dc2bbd 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -293,6 +293,10 @@ inline bool utf8_is_letter(int cat) { || cat == UTF8PROC_CATEGORY_LM; } +inline bool utf8_is_digit(int cat) { + return cat == UTF8PROC_CATEGORY_ND; +} + inline bool utf8_is_number(int cat) { return cat == UTF8PROC_CATEGORY_ND || cat == UTF8PROC_CATEGORY_NL || cat == UTF8PROC_CATEGORY_NO; } @@ -336,6 +340,34 @@ inline bool utf8_is_whitespace(int32_t ch) { ; } + +ssize_t utf8_len(const char *str, size_t len) { + if (str == NULL) return -1; + if (len == 0) return 0; + + int32_t ch = 0; + ssize_t num_utf8_chars = 0; + ssize_t char_len; + + uint8_t *ptr = (uint8_t *)str; + + size_t remaining = len; + + while (1) { + char_len = utf8proc_iterate(ptr, -1, &ch); + + if (ch == 0) break; + 
remaining -= char_len; + if (remaining == 0) break; + + ptr += char_len; + num_utf8_chars += char_len; + } + + return num_utf8_chars; +} + + int utf8_compare_len(const char *str1, const char *str2, size_t len) { if (len == 0) return 0; diff --git a/src/string_utils.h b/src/string_utils.h index 0e7dd235..0cf0382c 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -83,6 +83,8 @@ size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len); size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2); size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len); +ssize_t utf8_len(const char *str, size_t len); + bool utf8_is_hyphen(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); From 09fbb02042882bdfa54a0ae23cffa6785c4bf6e3 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 14 Oct 2017 01:36:56 -0400 Subject: [PATCH 03/89] [utils] adding utf8_equal_ignore_separators to string utils --- src/string_utils.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++ src/string_utils.h | 2 ++ 2 files changed, 57 insertions(+) diff --git a/src/string_utils.c b/src/string_utils.c index b2dc2bbd..9d27cc37 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -513,6 +513,61 @@ inline size_t utf8_common_prefix_ignore_separators(const char *str1, const char return utf8_common_prefix_len_ignore_separators(str1, str2, strlen(str2)); } +bool utf8_equal_ignore_separators_len(const char *str1, const char *str2, size_t len) { + if (len == 0) return false; + + int32_t c1 = -1, c2 = -1; + ssize_t len1, len2; + + uint8_t *ptr1 = (uint8_t *)str1; + uint8_t *ptr2 = (uint8_t *)str2; + + size_t remaining = len; + + while (1) { + len1 = utf8proc_iterate(ptr1, -1, &c1); + len2 = utf8proc_iterate(ptr2, -1, &c2); + + if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) { + ptr1++; + ptr2++; + remaining--; + if (remaining == 0) return true; + continue; + } + + if (c1 != 0 && c2 != 0 && c1 == c2) { + ptr1 += 
len1; + ptr2 += len2; + remaining -= len1; + } else if (utf8_is_hyphen(c1) || utf8_is_separator(utf8proc_category(c1))) { + ptr1 += len1; + if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) { + ptr2 += len2; + } + remaining -= len1; + } else if (utf8_is_hyphen(c2) || utf8_is_separator(utf8proc_category(c2))) { + ptr2 += len2; + remaining -= len2; + } else { + break; + } + + if (remaining == 0) return true; + + } + + return false; +} + +inline bool utf8_equal_ignore_separators(const char *str1, const char *str2) { + size_t len1 = strlen(str1); + size_t len2 = strlen(str2); + size_t len = len1 > len2 ? len1 : len2; + + return utf8_equal_ignore_separators_len(str1, str2, len); +} + bool string_is_digit(char *str, size_t len) { uint8_t *ptr = (uint8_t *)str; size_t idx = 0; diff --git a/src/string_utils.h b/src/string_utils.h index 0cf0382c..852f1813 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -83,6 +83,8 @@ size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len); size_t utf8_common_prefix_ignore_separators(const char *str1, const char *str2); size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *str2, size_t len); +bool utf8_equal_ignore_separators(const char *str1, const char *str2); + ssize_t utf8_len(const char *str, size_t len); bool utf8_is_hyphen(int32_t ch); From 2f2d3da7220e4c64e41680f9998007c1b50b4616 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 14 Oct 2017 01:42:08 -0400 Subject: [PATCH 04/89] [test] test for utf8_equal_ignore_separators --- test/test_string_utils.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/test_string_utils.c b/test/test_string_utils.c index 7ded5a4e..1fbf310b 100644 --- a/test/test_string_utils.c +++ b/test/test_string_utils.c @@ -60,6 +60,26 @@ TEST test_utf8_compare_ignore_separators(void) { PASS(); } +TEST test_utf8_equal_ignore_separators(void) { + char *str1 = "Bünderstraße "; + char *str2 = "Bünder-straße"; + + bool 
equal = utf8_common_prefix_ignore_separators(str1, str2); + ASSERT(equal); + + str1 = " Bünder-straße "; + str2 = "Bünder straße"; + equal = utf8_common_prefix_ignore_separators(str1, str2); + ASSERT(equal); + + str1 = "Bünder-straße-a"; + str2 = "Bünder straße aa"; + equal = utf8_common_prefix_ignore_separators(str1, str2); + ASSERT_FALSE(equal); + + PASS(); +} + TEST test_feature_array_add(void) { cstring_array *features = cstring_array_new(); if (features == NULL) { From 3a3aca8490c9124327b0bb24cf0907ebcac1e5c7 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 18 Oct 2017 03:59:05 -0400 Subject: [PATCH 05/89] [similarity] adding basic double metaphone implementation --- src/double_metaphone.c | 982 +++++++++++++++++++++++++++++++++++++++++ src/double_metaphone.h | 17 + 2 files changed, 999 insertions(+) create mode 100644 src/double_metaphone.c create mode 100644 src/double_metaphone.h diff --git a/src/double_metaphone.c b/src/double_metaphone.c new file mode 100644 index 00000000..e28264e2 --- /dev/null +++ b/src/double_metaphone.c @@ -0,0 +1,982 @@ +#include +#include +#include + +#include "double_metaphone.h" +#include "string_utils.h" +#include "utf8proc/utf8proc.h" + +static bool is_vowel(char c) { + return (c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U' || c == 'Y'); +} + +static char get_char_at(char *str, size_t len, ssize_t idx) { + if (idx < 0 || idx >= len) return 0; + return str[idx]; +} + +static char *get_string_at(char *str, size_t len, ssize_t idx) { + if (idx < 0 || idx >= len) return NULL; + return str + idx; +} + +static inline bool is_slavo_germanic(char *s) { + return strstr(s, "W") + || strstr(s, "K") + || strstr(s, "CZ") + || strstr(s, "WITZ"); +} + +static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t substr_len, size_t nargs, ...) 
{ + char *string_at_index = get_string_at(str, len, index); + if (string_at_index == NULL) return false; + + va_list args; + char *sub; + + va_start(args, nargs); + + bool matched = false; + + for (size_t i = 0; i < nargs; i++) { + sub = va_arg(args, char *); + if (sub == NULL) break; + + if (utf8_compare_len(string_at_index, sub, substr_len) == 0) { + matched = true; + break; + } + } + + va_end(args); + + return matched; + +} + + +double_metaphone_codes_t *double_metaphone(char *input) { + if (input == NULL) return NULL; + + char *ptr = utf8_upper(input); + + char *normalized = (char *)utf8proc_NFD((utf8proc_uint8_t *)ptr); + + if (normalized != NULL) { + free(ptr); + ptr = normalized; + } + + if (ptr == NULL) { + return NULL; + } + + char *str = ptr; + + size_t len = strlen(str); + char_array *primary = char_array_new_size(len + 1); + char_array *secondary = char_array_new_size(len + 1); + + bool slavo_germanic = is_slavo_germanic(str); + + size_t current = 0; + size_t last = len - 1; + + if (substring_equals(str, len, current, 2, 1, "ʻ")) { + str += 2; + } else if (get_char_at(str, len, current) == '\'') { + str++; + } + + if (substring_equals(str, len, current, 2, 5, "GN", "KN", "PN", "WR", "PS")) { + current++; + } else if (get_char_at(str, len, current) == 'X') { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + current++; + } + + while (true) { + char c = *(str + current); + if (c == '\x00') break; + + if (is_vowel(c) && current == 0) { + char_array_append(primary, "A"); + char_array_append(secondary, "A"); + current++; + continue; + } else if (c == 'B') { + /* "-mb", e.g", "dumb", already skipped over... 
*/ + char_array_append(primary, "P"); + char_array_append(secondary, "P"); + + if (get_char_at(str, len, current + 1) == 'B') { + current += 2; + } else { + current++; + } + continue; + // C with cedilla (denormalized) + } else if (substring_equals(str, len, current, 3, 2, "C\xcc\xa7", "Ç")) { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + current += 2; + } else if (c == 'C') { + // various germanic + if ((current > 1) + && !is_vowel(get_char_at(str, len, current - 2)) + && substring_equals(str, len, current - 1, 3, 1, "ACH") + && ((get_char_at(str, len, current + 2) != 'I') + && ((get_char_at(str, len, current + 2) != 'E') + || substring_equals(str, len, current - 2, 6, 2, "BACHER", "MACHER")) + ) + ) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + // special case for "caesar" + if ((current == 0) + && substring_equals(str, len, current, 6, 1, "CAESAR")) + { + char_array_append(primary, "S"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + // Italian e.g. "chianti" + if (substring_equals(str, len, current, 4, 1, "CHIA")) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + if (substring_equals(str, len, current, 2, 1, "CH")) { + // "michael" + if ((current > 0) + && substring_equals(str, len, current, 4, 1, "CHAE")) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + // Greek roots e.g. 
"chemistry", "chorus" + if ((current == 0) + && (substring_equals(str, len, current + 1, 5, 3, "HARAC", "HARIS", "HOREO") + || substring_equals(str, len, current + 1, 4, 3, "HIRO", "HAOS", "HAOT") + || substring_equals(str, len, current + 1, 3, 5, "HOR", "HYM", "HIA", "HEM", "HIM")) + ) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + // Germanic, Greek, or otherwise "ch" for "kh" sound + if ( + (substring_equals(str, len, 0, 4, 2, "VAN ", "VON ") + || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") + || substring_equals(str, len, 0, 3, 1, "SCH")) + // "ochestra", "orchid", "architect" but not "arch" + || substring_equals(str, len, current - 2, 6, 1, "ORCHES", "ARCHIT", "ORCHID") + || substring_equals(str, len, current + 2, 1, 2, "T", "S") + || ( + ((current == 0) || substring_equals(str, len, current - 1, 1, 4, "A", "O", "U", "E")) + // e.g. "wachtler", "wechsler", but not "tichner" + && substring_equals(str, len, current + 2, 1, 10, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ") + ) + ) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } else { + if (current > 0) { + if (substring_equals(str, len, 0, 2, 1, "MC")) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "K"); + } + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "K"); + } + } + current += 2; + continue; + } + + // e.g, "czerny" + if (substring_equals(str, len, current, 2, 1, "CZ") + && !substring_equals(str, len, current - 2, 4, 1, "WICZ")) + { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + current += 2; + continue; + } + + // e.g. 
"focaccia" + if (substring_equals(str, len, current + 1, 3, 1, "CIA")) { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + current += 3; + continue; + } + + // double 'C' but not if e.g. "McClellan" + if (substring_equals(str, len, current, 2, 1, "CC") + && !((current == 1) && get_char_at(str, len, 0) == 'M')) + { + // "bellocchio" but not "bacchus" + if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "H") + && !substring_equals(str, len, current + 2, 2, 1, "HU")) + { + // "accident", "accede", "succeed" + if (((current == 1) + && (get_char_at(str, len, current - 1) == 'A')) + || substring_equals(str, len, current - 1, 5, 2, "UCCEE", "UCCES")) + { + char_array_append(primary, "KS"); + char_array_append(secondary, "KS"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + } + current += 3; + continue; + } + } else { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + if (substring_equals(str, len, current, 2, 3, "CK", "CG", "CQ")) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + if (substring_equals(str, len, current, 2, 3, "CI", "CE", "CY")) { + if (substring_equals(str, len, current, 3, 3, "CIO", "CIE", "CIA")) { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + current += 2; + continue; + } + + // else + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + + if (substring_equals(str, len, current + 1, 2, 3, " C", " Q", " G")) { + current += 3; + } else if (substring_equals(str, len, current + 1, 1, 3, "C", "K", "Q") + && !substring_equals(str, len, current + 1, 2, 2, "CE", "CI")) + { + current += 2; + } else { + current++; + } + + continue; + } else if (substring_equals(str, len, current, 2, 1, "Đ")) { + char_array_append(primary, "T"); + 
char_array_append(secondary, "T"); + current += 2; + continue; + } else if (c == 'D') { + if (substring_equals(str, len, current, 2, 1, "DG")) { + if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "Y")) { + // e.g. "edge" + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + current += 3; + continue; + } else { + char_array_append(primary, "TK"); + char_array_append(secondary, "TK"); + current += 2; + continue; + } + } + + if (substring_equals(str, len, current, 2, 2, "DT", "DD")) { + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + current += 2; + continue; + } + + // else + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + current++; + continue; + } else if (c == 'F') { + if (get_char_at(str, len, current + 1) == 'F') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + continue; + } else if (c == 'G') { + if (get_char_at(str, len, current + 1) == 'H') { + if ((current > 0) && !is_vowel(get_char_at(str, len, current - 1))) { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; + } + + if (current < 3) { + // "ghislane", "ghiradelli" + if (get_char_at(str, len, current + 2) == 'I') { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + } else { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } + current += 2; + continue; + } + + // Parker's rule (with some further refinements) - e.g. "hugh" + if ( + ((current > 1) + && substring_equals(str, len, current - 2, 1, 3, "B", "H", "D")) + // e.g. "bough" + || ((current > 2) + && substring_equals(str, len, current - 3, 1, 3, "B", "H", "D")) + // e.g. "broughton" + || ((current > 3) + && substring_equals(str, len, current - 4, 1, 2, "B", "H")) + ) + { + current += 2; + continue; + } else { + // e.g. 
"laugh", "McLaughlin", "cough", "gough", "rough", "tough" + if ((current > 2) + && (get_char_at(str, len, current - 1) == 'U') + && substring_equals(str, len, current - 3, 1, 5, "C", "G", "L", "R", "T")) + { + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + } else if ((current > 0) + && get_char_at(str, len, current - 1) == 'I') + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } + current += 2; + continue; + } + + } + + if (get_char_at(str, len, current + 1) == 'N') { + if ((current == 1) && is_vowel(get_char_at(str, len, 0)) + && slavo_germanic) + { + char_array_append(primary, "KN"); + char_array_append(secondary, "N"); + // not e.g. "cagney" + } else if (!substring_equals(str, len, current + 2, 2, 1, "EY") + && (get_char_at(str, len, current + 1) != 'Y') + && !slavo_germanic) + { + char_array_append(primary, "N"); + char_array_append(secondary, "KN"); + } else { + char_array_append(primary, "KN"); + char_array_append(secondary, "KN"); + } + current += 2; + continue; + } + + // "tagliaro" + if (substring_equals(str, len, current + 1, 2, 1, "LI") + && !slavo_germanic) + { + char_array_append(primary, "KL"); + char_array_append(secondary, "L"); + current += 2; + continue; + } + + // -ges-, -gep-, -gel-, -gie- at beginning + if ((current == 0) + && ((get_char_at(str, len, current + 1) == 'Y') + || substring_equals(str, len, current + 1, 2, 13, "ES", "EP", + "EB", "EL", "EY", "IB", "IL", "IN", "IE", + "EI", "ER"))) + { + char_array_append(primary, "K"); + char_array_append(secondary, "J"); + current += 2; + continue; + } + + // -ger-, -gy- + if ( + (substring_equals(str, len, current + 1, 2, 1, "ER") + || (get_char_at(str, len, current + 1) == 'Y')) + && !substring_equals(str, len, 0, 6, 3, "DANGER", "RANGER", "MANGER") + && !substring_equals(str, len, current - 1, 1, 2, "E", "I") + && !substring_equals(str, len, current - 1, 3, 2, "RGY", "OGY") + ) + { + char_array_append(primary, "K"); + 
char_array_append(secondary, "J"); + current += 2; + continue; + } + + // italian e.g. "viaggi" + if (substring_equals(str, len, current + 1, 1, 3, "E", "I", "Y") + || substring_equals(str, len, current - 1, 4, 2, "AGGI", "OGGI")) + { + // obvious germanic + if ( + (substring_equals(str, len, 0, 4, 2, "VAN ", "VON ") + || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") + || substring_equals(str, len, 0, 3, 1, "SCH")) + || substring_equals(str, len, current + 1, 2, 1, "ET")) + { + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + } else { + if (substring_equals(str, len, current + 1, 4, 1, "IER ") + || ((current == len - 3) && substring_equals(str, len, current + 1, 3, 1, "IER"))) + { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + } else { + char_array_append(primary, "J"); + char_array_append(secondary, "K"); + } + current += 2; + continue; + } + } + + if (get_char_at(str, len, current + 1) == 'G') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + continue; + } else if (c == 'H') { + // only keep if first & before vowel or between 2 vowels + if (((current == 0) || is_vowel(get_char_at(str, len, current - 1))) + && is_vowel(get_char_at(str, len, current + 1))) + { + char_array_append(primary, "H"); + char_array_append(secondary, "H"); + current += 2; + // also takes care of "HH" + } else { + current++; + } + continue; + } else if (c == 'J') { + // obvious Spanish, "Jose", "San Jacinto" + if (substring_equals(str, len, current, 4, 1, "JOSE") + || substring_equals(str, len, current, 5, 1, "JOSÉ") + || substring_equals(str, len, 0, 4, 1, "SAN ")) + { + if (((current == 0) + && (get_char_at(str, len, current + 4) == ' ')) + || substring_equals(str, len, 0, 4, 1, "SAN ")) + { + char_array_append(primary, "H"); + char_array_append(secondary, "H"); + } else { + char_array_append(primary, "J"); + char_array_append(secondary, "H"); + } + + 
current++; + continue; + } + + if ((current == 0) + && !substring_equals(str, len, current, 4, 1, "JOSE") + && !substring_equals(str, len, current, 5, 1, "JOSÉ")) + { + // Yankelovich/Jankelowicz + char_array_append(primary, "J"); + char_array_append(secondary, "A"); + current++; + continue; + } else { + // Spanish pronoun of e.g. "bajador" + if (is_vowel(get_char_at(str, len, current - 1)) + && !slavo_germanic + && ((get_char_at(str, len, current + 1) == 'A') + || (get_char_at(str, len, current + 1) == 'O'))) + { + char_array_append(primary, "J"); + char_array_append(secondary, "H"); + } else { + if (current == last) { + char_array_append(primary, "J"); + } else { + if (!substring_equals(str, len, current + 1, 1, 8, "L", "T", + "K", "S", "N", "M", "B", "Z") + && !substring_equals(str, len, current - 1, 1, 3, "S", "K", "L")) + { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + } + } + } + + + // it could happen! + if (get_char_at(str, len, current + 1) == 'J') { + current += 2; + } else { + current++; + } + continue; + } + } else if (c == 'K') { + if (get_char_at(str, len, current + 1) == 'K') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + continue; + } else if (substring_equals(str, len, current, 2, 1, "Ł")) { + current += 2; + char_array_append(primary, "L"); + char_array_append(secondary, "L"); + continue; + } else if (c == 'L') { + if (get_char_at(str, len, current + 1) == 'L') { + // Spanish e.g. 
"Cabrillo", "Gallegos" + if (((current == (len - 3)) + && substring_equals(str, len, current - 1, 4, 3, "ILLO", "ILLA", "ALLE")) + || ((substring_equals(str, len, last - 1, 2, 2, "AS", "OS") + || substring_equals(str, len, last, 1, 2, "A", "O")) + && substring_equals(str, len, current - 1, 4, 1, "ALLE") + ) + ) + { + char_array_append(primary, "L"); + current += 2; + continue; + } + + current += 2; + } else { + current++; + } + char_array_append(primary, "L"); + char_array_append(secondary, "L"); + continue; + } else if (c == 'M') { + if ((substring_equals(str, len, current - 1, 3, 1, "UMB") + && (((current + 1) == last) + || substring_equals(str, len, current + 2, 2, 1, "ER"))) + || (get_char_at(str, len, current + 1) == 'M')) + { + current += 2; + } else { + current++; + } + char_array_append(primary, "M"); + char_array_append(secondary, "M"); + continue; + } else if (c == 'N') { + if (get_char_at(str, len, current + 1) == 'N') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "N"); + char_array_append(secondary, "N"); + continue; + } else if (substring_equals(str, len, current, 2, 1, "Ñ")) { + current += 2; + char_array_append(primary, "N"); + char_array_append(secondary, "N"); + continue; + } else if (c == 'P') { + if (get_char_at(str, len, current + 1) == 'H') { + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + current += 2; + continue; + } + + // also account for "Campbell", "raspberry" + if (substring_equals(str, len, current + 1, 1, 2, "P", "B")) { + current += 2; + } else { + current++; + } + + char_array_append(primary, "P"); + char_array_append(secondary, "P"); + continue; + } else if (c == 'Q') { + if (get_char_at(str, len, current + 1) == 'Q') { + current += 2; + } else { + current += 1; + } + + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + continue; + } else if (c == 'R') { + // french e.g. 
"rogier", but exclude "hochmeier" + if ((current == last) + && !slavo_germanic + && substring_equals(str, len, current - 2, 2, 1, "IE") + && !substring_equals(str, len, current - 4, 2, 2, "ME", "MA")) + { + char_array_append(secondary, "R"); + } else { + char_array_append(primary, "R"); + char_array_append(secondary, "R"); + } + + if (get_char_at(str, len, current + 1) == 'R') { + current += 2; + } else { + current++; + } + continue; + } else if (c == 'S') { + // special cases "island", "isle", "carlisle", "carlysle" + if (substring_equals(str, len, current - 1, 3, 2, "ISL", "YSL")) { + current++; + continue; + } + + // special case "sugar-" + if ((current == 0) + && substring_equals(str, len, current, 5, 1, "SUGAR")) + { + char_array_append(primary, "X"); + char_array_append(secondary, "S"); + current++; + continue; + } + + if (substring_equals(str, len, current, 2, 1, "SH")) { + // Germanic + if (substring_equals(str, len, current + 1, 4, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + } + current += 2; + continue; + } + + // Italian & Armenian + if (substring_equals(str, len, current, 3, 2, "SIO", "SIA") + || substring_equals(str, len, current, 4, 1, "SIAN")) + { + if (!slavo_germanic) { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + current += 3; + continue; + } + + /* German & Anglicisations, e.g. 
"Smith" match "Schmidt", "Snider" match "Schneider" + also, -sz- in Slavic language although in Hungarian it is pronounced 's' */ + if (((current == 0) + && substring_equals(str, len, current + 1, 1, 4, "M", "N", "L", "W")) + || substring_equals(str, len, current + 1, 1, 1, "Z")) + { + char_array_append(primary, "S"); + char_array_append(secondary, "X"); + if (substring_equals(str, len, current + 1, 1, 1, "Z")) { + current += 2; + } else { + current++; + } + continue; + } + + + if (substring_equals(str, len, current, 2, 1, "SC")) { + // Schlesinger's rule + if (get_char_at(str, len, current + 2) == 'H') { + // Dutch origin e.g. "school", "schooner" + if (substring_equals(str, len, current + 3, 2, 6, "OO", "ER", "EN", + "UY", "ED", "EM")) + { + // "Schermerhorn", "Schenker" + if (substring_equals(str, len, current + 3, 2, 2, "ER", "EN")) { + char_array_append(primary, "X"); + char_array_append(secondary, "SK"); + } else { + char_array_append(primary, "SK"); + char_array_append(secondary, "SK"); + } + current += 3; + continue; + } else { + if ((current == 0) && !is_vowel(get_char_at(str, len, 3)) + && (get_char_at(str, len, 3) != 'W')) + { + char_array_append(primary, "X"); + char_array_append(secondary, "S"); + } else { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + } + current += 3; + continue; + } + + if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "Y")) { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + current += 3; + continue; + } + + char_array_append(primary, "SK"); + char_array_append(secondary, "SK"); + current += 3; + continue; + } + } + + // French e.g. 
"resnais", "artois" + if ((current == last) + && substring_equals(str, len, current - 2, 2, 2, "AI", "OI")) + { + char_array_append(secondary, "S"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + + if (substring_equals(str, len, current - 1, 1, 2, "S", "Z")) { + current += 2; + } else { + current++; + } + continue; + } else if (c == 'T') { + + if (substring_equals(str, len, current, 4, 1, "TION")) { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + current += 3; + continue; + } + + if (substring_equals(str, len, current, 3, 2, "TIA", "TCH")) { + char_array_append(primary, "X"); + char_array_append(secondary, "X"); + current += 3; + continue; + } + + if (substring_equals(str, len, current, 2, 1, "TH") + || substring_equals(str, len, current, 3, 1, "TTH")) + { + // special case "Thomas", "Thames", or Germanic + if (substring_equals(str, len, current + 2, 2, 2, "OM", "AM") + || substring_equals(str, len, 0, 4, 2, "VAN ", "VON ") + || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") + || substring_equals(str, len, 0, 3, 1, "SCH")) + { + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + } else { + // yes, zero + char_array_append(primary, "0"); + char_array_append(secondary, "T"); + } + + current += 2; + continue; + } + + if (substring_equals(str, len, current + 1, 1, 2, "T", "D")) { + current += 2; + } else { + current++; + } + + char_array_append(primary, "T"); + char_array_append(secondary, "T"); + continue; + } else if (c == 'V') { + if (get_char_at(str, len, current + 1) == 'V') { + current += 2; + } else { + current++; + } + + char_array_append(primary, "F"); + char_array_append(secondary, "F"); + continue; + } else if (c == 'W') { + // can also be in the middle of word + if (substring_equals(str, len, current, 2, 1, "WR")) { + char_array_append(primary, "R"); + char_array_append(secondary, "R"); + current += 2; + continue; + } + + if ((current == 0) + && 
(is_vowel(get_char_at(str, len, current + 1)) + || substring_equals(str, len, current, 2, 1, "WH"))) + { + // Wasserman should match Vasserman + if (is_vowel(get_char_at(str, len, current + 1))) { + char_array_append(primary, "A"); + char_array_append(secondary, "F"); + } else { + // need Uomo to match Womo + char_array_append(primary, "A"); + char_array_append(secondary, "A"); + } + } + + // Arnow should match Arnoff + if (((current == last) && is_vowel(get_char_at(str, len, current - 1))) + || substring_equals(str, len, current - 1, 5, 4, "EWSKI", "EWSKY", + "OWSKI", "OWSKY") + || substring_equals(str, len, 0, 3, 1, "SCH")) + { + char_array_append(secondary, "F"); + current++; + continue; + } + + // Polish e.g. "Filipowicz" + if (substring_equals(str, len, current, 4, 2, "WICZ", "WITZ")) { + char_array_append(primary, "TS"); + char_array_append(secondary, "FX"); + current += 4; + continue; + } + + // else skip it + current++; + continue; + } else if (c == 'X') { + // French e.g. "breaux" + if (!((current == last) + && (substring_equals(str, len, current - 3, 3, 2, "IAU", "EAU") + || substring_equals(str, len, current - 2, 2, 2, "AU", "OU")))) + { + char_array_append(primary, "KS"); + char_array_append(secondary, "KS"); + } + + if (substring_equals(str, len, current + 1, 1, 2, "C", "X")) { + current += 2; + } else { + current++; + } + continue; + } else if (c == 'Z') { + // Chinese Pinyin e.g. 
"Zhao" + if (get_char_at(str, len, current + 1) == 'H') { + char_array_append(primary, "J"); + char_array_append(secondary, "J"); + current += 2; + continue; + } else if (substring_equals(str, len, current + 1, 2, 3, "ZO", "ZI", "ZA") + || (slavo_germanic + && ((current > 0) + && get_char_at(str, len, current - 1) != 'T'))) + { + char_array_append(primary, "S"); + char_array_append(secondary, "TS"); + } else { + char_array_append(primary, "S"); + char_array_append(secondary, "S"); + } + + if (get_char_at(str, len, current + 1) == 'Z') { + current += 2; + } else { + current++; + } + continue; + } else { + current++; + } + } + + double_metaphone_codes_t *codes = calloc(1, sizeof(double_metaphone_codes_t)); + if (codes == NULL) { + char_array_destroy(primary); + char_array_destroy(secondary); + return NULL; + } + + codes->primary = char_array_to_string(primary); + codes->secondary = char_array_to_string(secondary); + + free(ptr); + + return codes; +} + +void double_metaphone_codes_destroy(double_metaphone_codes_t *codes) { + if (codes != NULL) { + if (codes->primary != NULL) { + free(codes->primary); + } + + if (codes->secondary != NULL) { + free(codes->secondary); + } + + free(codes); + } +} \ No newline at end of file diff --git a/src/double_metaphone.h b/src/double_metaphone.h new file mode 100644 index 00000000..64dac8a7 --- /dev/null +++ b/src/double_metaphone.h @@ -0,0 +1,17 @@ +#ifndef DOUBLE_METAPHONE__H +#define DOUBLE_METAPHONE__H + +#include +#include + +typedef struct double_metaphone_codes { + char *primary; + char *secondary; +} double_metaphone_codes_t; + +double_metaphone_codes_t *double_metaphone(char *input); + +void double_metaphone_codes_destroy(double_metaphone_codes_t *codes); + +#endif + From c61007388bde7cb92782747fe83ec80e97c7a244 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 18 Oct 2017 04:00:57 -0400 Subject: [PATCH 06/89] [similarity] bug fixes and additional French, Spanish, Italian, and Slavic phonetics --- src/double_metaphone.c | 107 
+++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/src/double_metaphone.c b/src/double_metaphone.c index e28264e2..d911b22c 100644 --- a/src/double_metaphone.c +++ b/src/double_metaphone.c @@ -54,12 +54,20 @@ static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t } - double_metaphone_codes_t *double_metaphone(char *input) { if (input == NULL) return NULL; char *ptr = utf8_upper(input); + /* Note: NFD normalization will help with simple decomposable accent characters + like "É", "Ü", etc. which effectively become "E\u0301" and "U\u0308". It does + not handle characters like "Ł". For these, use Latin-ASCII transliteration + prior to calling this function. + + We can still check for a specific accented character like C with cedilla (Ç), + by comparing with its decomposed form i.e. "C\xcc\xa7" + */ + char *normalized = (char *)utf8proc_NFD((utf8proc_uint8_t *)ptr); if (normalized != NULL) { @@ -100,7 +108,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char c = *(str + current); if (c == '\x00') break; - if (is_vowel(c) && current == 0) { + if (current == 0 && is_vowel(c)) { char_array_append(primary, "A"); char_array_append(secondary, "A"); current++; @@ -116,8 +124,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { current++; } continue; - // C with cedilla (denormalized) - } else if (substring_equals(str, len, current, 3, 2, "C\xcc\xa7", "Ç")) { + // Ç - C with cedilla (denormalized) + } else if (substring_equals(str, len, current, 3, 1, "C\xcc\xa7")) { char_array_append(primary, "S"); char_array_append(secondary, "S"); current += 2; @@ -125,7 +133,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { // various germanic if ((current > 1) && !is_vowel(get_char_at(str, len, current - 2)) - && substring_equals(str, len, current - 1, 3, 1, "ACH") + && (substring_equals(str, len, current - 1, 3, 1, "ACH") + && !substring_equals(str, len, current + 2, 1, 3, 
"O", "A", "U")) && ((get_char_at(str, len, current + 2) != 'I') && ((get_char_at(str, len, current + 2) != 'E') || substring_equals(str, len, current - 2, 6, 2, "BACHER", "MACHER")) @@ -162,7 +171,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { && substring_equals(str, len, current, 4, 1, "CHAE")) { char_array_append(primary, "K"); - char_array_append(secondary, "K"); + char_array_append(secondary, "X"); current += 2; continue; } @@ -171,7 +180,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { if ((current == 0) && (substring_equals(str, len, current + 1, 5, 3, "HARAC", "HARIS", "HOREO") || substring_equals(str, len, current + 1, 4, 3, "HIRO", "HAOS", "HAOT") - || substring_equals(str, len, current + 1, 3, 5, "HOR", "HYM", "HIA", "HEM", "HIM")) + || (substring_equals(str, len, current + 1, 3, 5, "HOR", "HYM", "HIA", "HEM", "HIM") && !substring_equals(str, len, current + 1, 5, 2, "HEMIN"))) ) { char_array_append(primary, "K"); @@ -186,19 +195,21 @@ double_metaphone_codes_t *double_metaphone(char *input) { || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") || substring_equals(str, len, 0, 3, 1, "SCH")) // "ochestra", "orchid", "architect" but not "arch" - || substring_equals(str, len, current - 2, 6, 1, "ORCHES", "ARCHIT", "ORCHID") + || substring_equals(str, len, current - 2, 6, 3, "ORCHES", "ARCHIT", "ORCHID") || substring_equals(str, len, current + 2, 1, 2, "T", "S") || ( - ((current == 0) || substring_equals(str, len, current - 1, 1, 4, "A", "O", "U", "E")) + (((current == 0) || substring_equals(str, len, current - 1, 1, 4, "A", "O", "U", "E")) + // e.g. not "breach", "broach", "pouch", "beech", etc. + && !substring_equals(str, len, current - 2, 2, 6, "EA", "OU", "EE", "OA", "OO", "AU") + // e.g. not "lunch", "birch", "gulch" + && !substring_equals(str, len, current - 1, 1, 3, "L", "R", "N")) // e.g. 
"wachtler", "wechsler", but not "tichner" - && substring_equals(str, len, current + 2, 1, 10, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ") + && ((current + 1 == last) || substring_equals(str, len, current + 2, 1, 10, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ")) ) ) { char_array_append(primary, "K"); char_array_append(secondary, "K"); - current += 2; - continue; } else { if (current > 0) { if (substring_equals(str, len, 0, 2, 1, "MC")) { @@ -210,7 +221,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { } } else { char_array_append(primary, "X"); - char_array_append(secondary, "K"); + char_array_append(secondary, "X"); } } current += 2; @@ -227,21 +238,13 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - // e.g. "focaccia" - if (substring_equals(str, len, current + 1, 3, 1, "CIA")) { - char_array_append(primary, "X"); - char_array_append(secondary, "X"); - current += 3; - continue; - } - // double 'C' but not if e.g. "McClellan" if (substring_equals(str, len, current, 2, 1, "CC") && !((current == 1) && get_char_at(str, len, 0) == 'M')) { // "bellocchio" but not "bacchus" if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "H") - && !substring_equals(str, len, current + 2, 2, 1, "HU")) + && !substring_equals(str, len, current + 2, 3, 4, "HUS", "HUM", "HUN", "HAN")) { // "accident", "accede", "succeed" if (((current == 1) @@ -250,18 +253,24 @@ double_metaphone_codes_t *double_metaphone(char *input) { { char_array_append(primary, "KS"); char_array_append(secondary, "KS"); + // "pinocchio" but not "riccio" or "picchu" + } else if (get_char_at(str, len, current + 2) == 'H' + && !substring_equals(str, len, current + 2, 2, 2, "HU", "HA")) { + char_array_append(primary, "K"); + char_array_append(secondary, "X"); } else { char_array_append(primary, "X"); char_array_append(secondary, "X"); } current += 3; continue; + } else { + // Pierce's rule + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + 
current += 2; + continue; } - } else { - char_array_append(primary, "K"); - char_array_append(secondary, "K"); - current += 2; - continue; } if (substring_equals(str, len, current, 2, 3, "CK", "CG", "CQ")) { @@ -271,8 +280,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - if (substring_equals(str, len, current, 2, 3, "CI", "CE", "CY")) { - if (substring_equals(str, len, current, 3, 3, "CIO", "CIE", "CIA")) { + if (substring_equals(str, len, current, 2, 4, "CI", "CJ", "CE", "CY")) { + if (substring_equals(str, len, current, 3, 5, "CIO", "CIE", "CIA", "CIU")) { char_array_append(primary, "S"); char_array_append(secondary, "X"); } else { @@ -297,11 +306,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { current++; } - continue; - } else if (substring_equals(str, len, current, 2, 1, "Đ")) { - char_array_append(primary, "T"); - char_array_append(secondary, "T"); - current += 2; continue; } else if (c == 'D') { if (substring_equals(str, len, current, 2, 1, "DG")) { @@ -350,7 +354,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - if (current < 3) { + if (current == 0) { // "ghislane", "ghiradelli" if (get_char_at(str, len, current + 2) == 'I') { char_array_append(primary, "J"); @@ -386,7 +390,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "F"); char_array_append(secondary, "F"); } else if ((current > 0) - && get_char_at(str, len, current - 1) == 'I') + && get_char_at(str, len, current - 1) != 'I') { char_array_append(primary, "K"); char_array_append(secondary, "K"); @@ -399,7 +403,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { if (get_char_at(str, len, current + 1) == 'N') { if ((current == 1) && is_vowel(get_char_at(str, len, 0)) - && slavo_germanic) + && !slavo_germanic) { char_array_append(primary, "KN"); char_array_append(secondary, "N"); @@ -431,7 +435,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { // -ges-, -gep-, -gel-, -gie- 
at beginning if ((current == 0) && ((get_char_at(str, len, current + 1) == 'Y') - || substring_equals(str, len, current + 1, 2, 13, "ES", "EP", + || substring_equals(str, len, current + 1, 2, 11, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"))) { @@ -469,6 +473,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { { char_array_append(primary, "K"); char_array_append(secondary, "K"); + } else { if (substring_equals(str, len, current + 1, 4, 1, "IER ") || ((current == len - 3) && substring_equals(str, len, current + 1, 3, 1, "IER"))) @@ -479,9 +484,9 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "J"); char_array_append(secondary, "K"); } - current += 2; - continue; } + current += 2; + continue; } if (get_char_at(str, len, current + 1) == 'G') { @@ -546,7 +551,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "J"); char_array_append(secondary, "H"); } else { - if (current == last) { + if (current == last || ((current == last - 1 || get_char_at(str, len, current + 2) == ' ') && isalpha(get_char_at(str, len, current - 1)) && substring_equals(str, len, current + 1, 1, 2, "A", "O"))) { char_array_append(primary, "J"); } else { if (!substring_equals(str, len, current + 1, 1, 8, "L", "T", @@ -559,7 +564,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { } } - // it could happen! if (get_char_at(str, len, current + 1) == 'J') { current += 2; @@ -578,11 +582,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "K"); char_array_append(secondary, "K"); continue; - } else if (substring_equals(str, len, current, 2, 1, "Ł")) { - current += 2; - char_array_append(primary, "L"); - char_array_append(secondary, "L"); - continue; } else if (c == 'L') { if (get_char_at(str, len, current + 1) == 'L') { // Spanish e.g. 
"Cabrillo", "Gallegos" @@ -619,6 +618,12 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "M"); char_array_append(secondary, "M"); continue; + // Ñ (NFD normalized) + } else if (substring_equals(str, len, current, 3, 1, "N\xcc\x83")) { + current += 3; + char_array_append(primary, "N"); + char_array_append(secondary, "N"); + continue; } else if (c == 'N') { if (get_char_at(str, len, current + 1) == 'N') { current += 2; @@ -626,11 +631,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { current++; } - char_array_append(primary, "N"); - char_array_append(secondary, "N"); - continue; - } else if (substring_equals(str, len, current, 2, 1, "Ñ")) { - current += 2; char_array_append(primary, "N"); char_array_append(secondary, "N"); continue; @@ -798,7 +798,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "S"); } - if (substring_equals(str, len, current - 1, 1, 2, "S", "Z")) { + if (substring_equals(str, len, current + 1, 1, 2, "S", "Z")) { + current += 2; } else { current++; From 245aa226e087fd847947c9fa0c1953e7a12b43eb Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 19 Oct 2017 04:48:50 -0400 Subject: [PATCH 07/89] [utils] function to create an array of uint32_t codepoints from a UTF-8 string, a few bug fixes to string_utils --- src/string_utils.c | 25 ++++++++++++++++++++++++- src/string_utils.h | 4 +++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/string_utils.c b/src/string_utils.c index 9d27cc37..f1155001 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -361,12 +361,35 @@ ssize_t utf8_len(const char *str, size_t len) { if (remaining == 0) break; ptr += char_len; - num_utf8_chars += char_len; + num_utf8_chars++; } return num_utf8_chars; } +uint32_array *unicode_codepoints(const char *str) { + if (str == NULL) return NULL; + + uint32_array *a = uint32_array_new(); + + int32_t ch = 0; + ssize_t num_utf8_chars = 0; + ssize_t char_len; + + uint8_t *ptr 
= (uint8_t *)str; + + while (1) { + char_len = utf8proc_iterate(ptr, -1, &ch); + + if (ch == 0) break; + + uint32_array_push(a, (uint32_t)ch); + ptr += char_len; + } + + return a; +} + int utf8_compare_len(const char *str1, const char *str2, size_t len) { if (len == 0) return 0; diff --git a/src/string_utils.h b/src/string_utils.h index 852f1813..e7760e45 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -74,7 +74,7 @@ ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *ds char *utf8_lower_options(const char *s, utf8proc_option_t options); char *utf8_lower(const char *s); char *utf8_upper_options(const char *s, utf8proc_option_t options); -char *utf8_lower(const char *s); +char *utf8_upper(const char *s); int utf8_compare(const char *str1, const char *str2); int utf8_compare_len(const char *str1, const char *str2, size_t len); @@ -87,6 +87,8 @@ bool utf8_equal_ignore_separators(const char *str1, const char *str2); ssize_t utf8_len(const char *str, size_t len); +uint32_array *unicode_codepoints(const char *str); + bool utf8_is_hyphen(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); From bd477976d1374f5edcd56e4b27f12c3615b52f9a Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 19 Oct 2017 04:51:28 -0400 Subject: [PATCH 08/89] [similarity] string similarity measures for Damerau-Levenshtein and Jaro-Winkler distances. Both operate on unicode points internally for lengths, etc. instead of byte strings and the Levenshtein distance uses only one array instead of needing to store the full matrix of transitions. 
--- src/string_similarity.c | 216 ++++++++++++++++++++++++++++++++++++++++ src/string_similarity.h | 18 ++++ 2 files changed, 234 insertions(+) create mode 100644 src/string_similarity.c create mode 100644 src/string_similarity.h diff --git a/src/string_similarity.c b/src/string_similarity.c new file mode 100644 index 00000000..9608498b --- /dev/null +++ b/src/string_similarity.c @@ -0,0 +1,216 @@ +#include "string_similarity.h" +#include "string_utils.h" + + +size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + + size_t num_bytes = (len1 + 1) * sizeof(size_t); + + size_t *column = malloc(num_bytes); + for (size_t y = 1; y <= len1; y++) { + column[y] = y; + } + + size_t transpose_diag = 0; + size_t last_diag = 0; + + for (size_t x = 1; x <= len2; x++) { + column[0] = x; + for (size_t y = 1, last_diag = x - 1; y <= len1; y++) { + size_t old_diag = column[y]; + size_t cost = (u1[y - 1] == u2[x - 1] ? 
0 : 1); + + size_t v1 = column[y] + 1; + size_t v2 = column[y - 1] + 1; + size_t v3 = last_diag + cost; + + size_t min = v1; + if (v2 < min) min = v2; + if (v3 < min) min = v3; + + if (x > 1 && y > 1 && u1[y - 1] == u2[x - 2] && u1[y - 2] == u2[x - 1]) { + size_t v4 = transpose_diag + cost; + if (v4 < min) min = v4; + } + + column[y] = min; + + last_diag = old_diag; + } + transpose_diag = last_diag; + } + + size_t dist = column[len1]; + free(column); + return dist; +} + +ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost) { + if (s1 == NULL || s2 == NULL) return -1; + + uint32_array *u1 = unicode_codepoints(s1); + if (u1 == NULL) return -1.0; + + uint32_array *u2 = unicode_codepoints(s2); + + if (u2 == NULL) { + uint32_array_destroy(u1); + return -1.0; + } + + ssize_t lev = damerau_levenshtein_distance_unicode(u1, u2, replace_cost); + + uint32_array_destroy(u1); + uint32_array_destroy(u2); + return lev; +} + +ssize_t damerau_levenshtein_distance(char *s1, char *s2) { + return damerau_levenshtein_distance_replace_cost(s1, s2, 0); +} + +double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) { + if (u1_array == NULL || u2_array == NULL) return -1.0; + + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + // If both strings are zero-length, return 1. If only one is, return 0 + if (len1 == 0) return len2 == 0 ? 1.0 : 0.0; + + size_t max_len = len1 > len2 ? len1 : len2; + size_t match_distance = (max_len / 2) - 1; + + uint8_t *u1_matches = calloc(len2, sizeof(uint8_t)); + uint8_t *u2_matches = calloc(len1, sizeof(uint8_t)); + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + + double matches = 0.0; + double transpositions = 0.0; + + size_t i = 0; + + // count matches + for (size_t i = 0; i < len1; i++) { + // start and end take into account the match distance + size_t start = i > match_distance ? i - match_distance : 0; + size_t end = (i + match_distance + 1) < len2 ? 
i + match_distance + 1 : len2; + + for (size_t k = start; k < end; k++) { + // already a match at k + if (u2_matches[k]) continue; + // codepoints not equal + if (u1[i] != u2[k]) continue; + // otherwise record a match on both sides and increment counter + u1_matches[i] = true; + u2_matches[k] = true; + matches++; + break; + } + } + + if (matches == 0) { + free(u1_matches); + free(u2_matches); + return 0.0; + } + + + // count transpositions + size_t k = 0; + for (size_t i = 0; i < len1; i++) { + // wait for a match in u1 + if (!u1_matches[i]) continue; + // get the next matched character in u2 + while (!u2_matches[k]) k++; + // it's a transposition + if (u1[i] != u2[k]) transpositions++; + k++; + } + + // transpositions double-count transposed characters, so divide by 2 + transpositions /= 2.0; + + free(u1_matches); + free(u2_matches); + + // Jaro distance + return ((matches / len1) + + (matches / len2) + + ((matches - transpositions) / matches)) / 3.0; +} + +double jaro_distance(const char *s1, const char *s2) { + if (s1 == NULL || s2 == NULL) { + return -1.0; + } + + uint32_array *u1 = unicode_codepoints(s1); + if (u1 == NULL) return -1.0; + + uint32_array *u2 = unicode_codepoints(s2); + + if (u2 == NULL) { + uint32_array_destroy(u1); + return -1.0; + } + + double jaro = jaro_distance_unicode(u1, u2); + uint32_array_destroy(u1); + uint32_array_destroy(u2); + return jaro; +} + +double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) { + if (s1 == NULL || s2 == NULL) { + return -1.0; + } + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return -1.0; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return -1.0; + } + + double jaro = jaro_distance_unicode(u1_array, u2_array); + + double j; + + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = 
u2_array->a; + + size_t m = len1 < len2 ? len1 : len2; + + size_t i = 0; + for (; i < m; i++) { + if (u1[i] != u2[i]) break; + } + + double jaro_winkler = jaro; + + if (jaro >= bonus_threshold) { + jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale; + } + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + + return jaro_winkler > 1.0 ? 1.0 : jaro_winkler; +} + +inline double jaro_winkler_distance(const char *s1, const char *s2) { + return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD); +} diff --git a/src/string_similarity.h b/src/string_similarity.h new file mode 100644 index 00000000..d5fcf805 --- /dev/null +++ b/src/string_similarity.h @@ -0,0 +1,18 @@ +#ifndef STRING_SIMILARITY_H +#define STRING_SIMILARITY_H + +#include +#include + +#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1 +#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7 + +ssize_t damerau_levenshtein_distance(char *s1, char *s2); +ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost); + +double jaro_distance(const char *s1, const char *s2); +double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold); +double jaro_winkler_distance(const char *s1, const char *s2); + + +#endif \ No newline at end of file From 9d2a111286451e9648787170718c1d2f4940730a Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 20 Oct 2017 02:34:30 -0400 Subject: [PATCH 09/89] [numex] when parsing numex, bail on rules in whole_tokens_only languages if there are contiguous rules with no right context rules (example: something that wouldn't make sense like VL in Latin) --- src/numex.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/numex.c b/src/numex.c index 107768fa..b812ca8e 100644 --- a/src/numex.c +++ b/src/numex.c @@ -844,15 +844,21 @@ numex_result_array *convert_numeric_expressions(char 
*str, char *lang) { FLOOR_LOG_BASE(rule.value, prev_rule.radix) < FLOOR_LOG_BASE(prev_rule.value, prev_rule.radix)) { result.value += rule.value; log_debug("Last token was RIGHT_CONTEXT_ADD, value=%" PRId64 "\n", result.value); - } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD && (!whole_tokens_only || complete_token)) { + } else if (prev_rule.rule_type != NUMEX_NULL && rule.rule_type != NUMEX_STOPWORD) { log_debug("Had previous token with no context, finishing previous rule before returning\n"); - - result.len = prev_result_len; - number_finished = true; - advance_index = false; - state = start_state; - rule = prev_rule = NUMEX_NULL_RULE; - prev_result_len = 0; + if (!whole_tokens_only || complete_token) { + result.len = prev_result_len; + number_finished = true; + advance_index = false; + state = start_state; + rule = prev_rule = NUMEX_NULL_RULE; + prev_result_len = 0; + } else { + rule = NUMEX_NULL_RULE; + last_was_separator = false; + state.state = NUMEX_SEARCH_STATE_SKIP_TOKEN; + continue; + } } else if (rule.rule_type != NUMEX_STOPWORD) { result.value = rule.value; log_debug("Got number, result.value=%" PRId64 "\n", result.value); @@ -879,6 +885,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { if (rule.right_context_type == NUMEX_RIGHT_CONTEXT_NONE && !whole_tokens_only) { number_finished = true; } + log_debug("rule is ordinal\n"); } @@ -1037,6 +1044,7 @@ size_t possible_ordinal_digit_len(char *str, size_t len) { int32_t ch; size_t digit_len = 0; + bool seen_first_digit = false; while (idx < len) { ssize_t char_len = utf8proc_iterate(ptr, len, &ch); @@ -1048,10 +1056,14 @@ size_t possible_ordinal_digit_len(char *str, size_t len) { // 0-9 only for this is_digit = ch >= 48 && ch <= 57; - if ((idx == 0 && !is_digit) || (idx > 0 && is_digit && !last_was_digit)) { + if ((seen_first_digit && is_digit && !last_was_digit)) { return 0; } + if (is_digit && !seen_first_digit) { + seen_first_digit = true; + } 
+ if (is_digit) { digit_len += char_len; } From 1c5afcafd294e52fa9ec1296a84b1ebb46e581da Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 20 Oct 2017 02:43:39 -0400 Subject: [PATCH 10/89] [phrases] when skipping/ignoring hyphens in trie search, make sure that the new longer phrase ends at a word boundary (space, hyphen, end of string, etc.) --- src/trie_search.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/trie_search.c b/src/trie_search.c index 8518db89..fa78adf8 100644 --- a/src/trie_search.c +++ b/src/trie_search.c @@ -633,6 +633,8 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u trie_data_node_t data_node; trie_node_t terminal_node; + bool phrase_at_hyphen = false; + while (idx < len) { char_len = utf8proc_iterate(ptr, len, &codepoint); log_debug("char_len = %zu, char=%d\n", char_len, codepoint); @@ -653,7 +655,7 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u for (i = 0; i < char_len; i++) { node_id = trie_get_transition_index(self, last_node, *char_ptr); node = trie_get_node(self, node_id); - log_debug("At idx=%zu, char=%.*s\n", i, (int)char_len, char_ptr); + log_debug("At idx=%u, i=%zu, char=%.*s\n", idx, i, (int)char_len, char_ptr); if (node.check != last_node_id) { log_debug("node.check = %d and last_node_id = %d\n", node.check, last_node_id); @@ -665,7 +667,12 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u } if (is_hyphen && node.check != last_node_id) { - log_debug("No space transition\n"); + log_debug("No space transition, phrase_len=%zu\n", phrase_len); + if (phrase_len > 0 && phrase_len == idx) { + log_debug("phrase_at_hyphen\n"); + phrase_at_hyphen = true; + } + ptr += char_len; idx += char_len; separator_char_len = char_len; @@ -720,10 +727,20 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u log_debug("match_len=%zu\n", match_len); if (tail_match_len == 
current_tail_len - tail_pos) { + if (phrase_at_hyphen) { + char_len = utf8proc_iterate(ptr + char_len, len, &codepoint); + if (char_len > 0 && utf8proc_codepoint_valid(codepoint)) { + int cat = utf8proc_category(codepoint); + + if (codepoint != 0 && !utf8_is_hyphen(codepoint) && !utf8_is_separator(cat) && !utf8_is_punctuation(cat)) { + return (phrase_t){phrase_start, phrase_len, value}; + } + } + } if (first_char) phrase_start = idx; phrase_len = (uint32_t)(idx + match_len) - phrase_start; - log_debug("tail match! phrase_len=%u\n", phrase_len); + log_debug("tail match! phrase_len=%u, len=%zu\n", phrase_len, len); value = data_node.data; return (phrase_t){phrase_start, phrase_len, value}; } else { From 1fbc238b60e333ce94f7d8356b0442630a407689 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 20 Oct 2017 02:45:32 -0400 Subject: [PATCH 11/89] [numex] adding functions to parse and validate a Roman numeral --- src/numex.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/numex.h | 3 +++ 2 files changed, 57 insertions(+) diff --git a/src/numex.c b/src/numex.c index b812ca8e..908fd8fe 100644 --- a/src/numex.c +++ b/src/numex.c @@ -1160,3 +1160,57 @@ char *replace_numeric_expressions(char *str, char *lang) { return char_array_to_string(replacement); } + + +static inline bool is_roman_numeral_char(char c) { + return (c == 'i' || + c == 'v' || + c == 'x' || + c == 'l' || + c == 'c' || + c == 'd' || + c == 'm' || + c == 'I' || + c == 'V' || + c == 'X' || + c == 'L' || + c == 'C' || + c == 'D' || + c == 'M'); +} + +bool is_valid_roman_numeral(char *str, size_t len) { + char *copy = strndup(str, len); + if (copy == NULL) return false; + + numex_result_array *results = convert_numeric_expressions(copy, LATIN_LANGUAGE_CODE); + if (results == NULL) { + free(copy); + return false; + } + + bool ret = results->n == 1 && results->a[0].len == len; + numex_result_array_destroy(results); + free(copy); + return ret; +} + +bool is_roman_numeral_len(char *str, size_t len) { + size_t 
i = 0; + bool seen_roman = false; + for (size_t i = 0; i < len; i++) { + char c = *(str + i); + if (c == 0) break; + if (is_roman_numeral_char(c)) { + seen_roman = true; + } else { + return false; + } + } + + return seen_roman && is_valid_roman_numeral(str, len); +} + +inline bool is_roman_numeral(char *str) { + return is_roman_numeral_len(str, strlen(str)); +} diff --git a/src/numex.h b/src/numex.h index d80f96e1..1be25592 100644 --- a/src/numex.h +++ b/src/numex.h @@ -151,6 +151,9 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang); size_t ordinal_suffix_len(char *s, size_t len, char *lang); size_t possible_ordinal_digit_len(char *str, size_t len); +bool is_roman_numeral(char *str); +bool is_roman_numeral_len(char *str, size_t len); + bool numex_table_write(FILE *file); bool numex_table_save(char *filename); From b7eda37e444990d3e4f25d8c3e0ac82b483e9970 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 20 Oct 2017 02:45:55 -0400 Subject: [PATCH 12/89] [utils] adding utf8_is_digit to string_utils.h --- src/string_utils.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/string_utils.h b/src/string_utils.h index e7760e45..5ae041e2 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -92,6 +92,7 @@ uint32_array *unicode_codepoints(const char *str); bool utf8_is_hyphen(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); +bool utf8_is_digit(int cat); bool utf8_is_letter_or_number(int cat); bool utf8_is_punctuation(int cat); bool utf8_is_symbol(int cat); From 5c927e780fb2edd60902c436e5b46ea660e7cccd Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 20 Oct 2017 02:51:26 -0400 Subject: [PATCH 13/89] [expand] adding ability to expand Roman numerals with ordinal suffixes like IXe in French --- src/libpostal.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/libpostal.c b/src/libpostal.c index c969d86c..aca879f4 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ 
-774,18 +774,32 @@ static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, to } static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { - size_t token_digit_len = possible_ordinal_digit_len(str + token.offset, token.len); size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - bool ret = false; + int32_t unichr = 0; + const uint8_t *ptr = (const uint8_t *)str; - if (len_ordinal_suffix == 0 || token_digit_len == 0 || token_digit_len + len_ordinal_suffix < token.len) { - return false; - } else if (len_ordinal_suffix == token.len && i > 0 && prev_token.len > 0) { - size_t prev_token_digit_len = possible_ordinal_digit_len(str + prev_token.offset, prev_token.len); - ret = prev_token_digit_len == prev_token.len; + if (len_ordinal_suffix > 0) { + ssize_t start = 0; + size_t token_offset = token.offset; + size_t token_len = token.len; + + if (len_ordinal_suffix < token.len) { + start = token.offset + token.len - len_ordinal_suffix; + token_offset = token.offset; + token_len = token.len - len_ordinal_suffix; + } else { + start = prev_token.offset + prev_token.len; + token_offset = prev_token.offset; + token_len = prev_token.len; + } + ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); + if (prev_char_len <= 0) return false; + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) { + return false; + } } else { - ret = true; + return false; } cstring_array *strings = tree->strings; @@ -793,12 +807,10 @@ static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, ch // add_normalized_strings_token won't be called a second time. 
add_normalized_strings_token(strings, str, token, options); - char_array *key = char_array_new_size(token.len - len_ordinal_suffix + 1); - char_array_cat_len(key, str + token.offset, token.len - len_ordinal_suffix); - char *expansion = char_array_get_string(key); - cstring_array_add_string(strings, expansion); - char_array_destroy(key); - return ret; + token_t normalized_token = token; + normalized_token.len = token.len - len_ordinal_suffix; + add_normalized_strings_token(strings, str, normalized_token, options); + return true; } static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { From 4ccc2a9e9fa32ff7f19570d1bee5aa2ab6a8a317 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 21 Oct 2017 02:45:08 -0400 Subject: [PATCH 14/89] [fix] making string args const in string_similarity module --- src/string_similarity.c | 4 ++-- src/string_similarity.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index 9608498b..899b9ec9 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -50,7 +50,7 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array return dist; } -ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost) { +ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) { if (s1 == NULL || s2 == NULL) return -1; uint32_array *u1 = unicode_codepoints(s1); @@ -70,7 +70,7 @@ ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t rep return lev; } -ssize_t damerau_levenshtein_distance(char *s1, char *s2) { +ssize_t damerau_levenshtein_distance(const char *s1, const char *s2) { return damerau_levenshtein_distance_replace_cost(s1, s2, 0); } diff --git a/src/string_similarity.h b/src/string_similarity.h index d5fcf805..77f82e8d 100644 --- a/src/string_similarity.h +++ 
b/src/string_similarity.h @@ -7,12 +7,12 @@ #define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1 #define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7 -ssize_t damerau_levenshtein_distance(char *s1, char *s2); -ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost); +ssize_t damerau_levenshtein_distance(const char *s1, const char *s2); +ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost); double jaro_distance(const char *s1, const char *s2); double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold); double jaro_winkler_distance(const char *s1, const char *s2); -#endif \ No newline at end of file +#endif From 5c0ecf89637ed67d1c6a94de7c5bf0ddf69faf2c Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 21 Oct 2017 10:34:12 -0400 Subject: [PATCH 15/89] [dedupe] Jaccard similarity --- src/jaccard.c | 27 +++++++++++++++++++++++++++ src/jaccard.h | 11 +++++++++++ 2 files changed, 38 insertions(+) create mode 100644 src/jaccard.c create mode 100644 src/jaccard.h diff --git a/src/jaccard.c b/src/jaccard.c new file mode 100644 index 00000000..1f96c61f --- /dev/null +++ b/src/jaccard.c @@ -0,0 +1,27 @@ +#include "jaccard.h" + + +double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) { + if (s1 == NULL || s2 == NULL) return -1.0; + + size_t set_intersection = 0; + size_t set_union = 0; + + khiter_t k; + const char *key; + + kh_foreach_key(s1, key, { + k = kh_get(str_set, s2, key); + if (k != kh_end(s2)) { + set_intersection++; + } else { + set_union++; + } + }); + + // set_union contains all the keys that were in s1 but not s2 + // so just add all the keys in s2 to complete the union + set_union += kh_size(s2); + + return (double)set_intersection / set_union; +} \ No newline at end of file diff --git a/src/jaccard.h b/src/jaccard.h new file mode 100644 index 00000000..a6468078 --- /dev/null +++ b/src/jaccard.h @@ -0,0 +1,11 @@ 
+#ifndef JACCARD_H +#define JACCARD_H + +#include +#include + +#include "collections.h" + +double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2); + +#endif \ No newline at end of file From e8ae3bbbafd89b63fbef28544c1bac14c4788adc Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 23 Oct 2017 15:20:04 -0400 Subject: [PATCH 16/89] [similarity] using NULL-terminated varargs in double metaphone instead of specifying the number of arguments, should be more maintainable --- src/double_metaphone.c | 240 ++++++++++++++++++++--------------------- 1 file changed, 119 insertions(+), 121 deletions(-) diff --git a/src/double_metaphone.c b/src/double_metaphone.c index d911b22c..54f03fad 100644 --- a/src/double_metaphone.c +++ b/src/double_metaphone.c @@ -27,19 +27,17 @@ static inline bool is_slavo_germanic(char *s) { || strstr(s, "WITZ"); } -static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t substr_len, size_t nargs, ...) { +static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t substr_len, ...) 
{ char *string_at_index = get_string_at(str, len, index); if (string_at_index == NULL) return false; va_list args; - char *sub; - - va_start(args, nargs); + va_start(args, substr_len); bool matched = false; - for (size_t i = 0; i < nargs; i++) { - sub = va_arg(args, char *); + while (true) { + char *sub = va_arg(args, char *); if (sub == NULL) break; if (utf8_compare_len(string_at_index, sub, substr_len) == 0) { @@ -90,13 +88,13 @@ double_metaphone_codes_t *double_metaphone(char *input) { size_t current = 0; size_t last = len - 1; - if (substring_equals(str, len, current, 2, 1, "ʻ")) { + if (substring_equals(str, len, current, 2, "ʻ", NULL)) { str += 2; } else if (get_char_at(str, len, current) == '\'') { str++; } - if (substring_equals(str, len, current, 2, 5, "GN", "KN", "PN", "WR", "PS")) { + if (substring_equals(str, len, current, 2, "GN", "KN", "PN", "WR", "PS", NULL)) { current++; } else if (get_char_at(str, len, current) == 'X') { char_array_append(primary, "S"); @@ -125,7 +123,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { } continue; // Ç - C with cedilla (denormalized) - } else if (substring_equals(str, len, current, 3, 1, "C\xcc\xa7")) { + } else if (substring_equals(str, len, current, 3, "C\xcc\xa7", NULL)) { char_array_append(primary, "S"); char_array_append(secondary, "S"); current += 2; @@ -133,11 +131,11 @@ double_metaphone_codes_t *double_metaphone(char *input) { // various germanic if ((current > 1) && !is_vowel(get_char_at(str, len, current - 2)) - && (substring_equals(str, len, current - 1, 3, 1, "ACH") - && !substring_equals(str, len, current + 2, 1, 3, "O", "A", "U")) + && (substring_equals(str, len, current - 1, 3, "ACH", NULL) + && !substring_equals(str, len, current + 2, 1, "O", "A", "U", NULL)) && ((get_char_at(str, len, current + 2) != 'I') && ((get_char_at(str, len, current + 2) != 'E') - || substring_equals(str, len, current - 2, 6, 2, "BACHER", "MACHER")) + || substring_equals(str, len, current - 2, 6, "BACHER", 
"MACHER", NULL)) ) ) { @@ -149,7 +147,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { // special case for "caesar" if ((current == 0) - && substring_equals(str, len, current, 6, 1, "CAESAR")) + && substring_equals(str, len, current, 6, "CAESAR", NULL)) { char_array_append(primary, "S"); char_array_append(secondary, "K"); @@ -158,17 +156,17 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // Italian e.g. "chianti" - if (substring_equals(str, len, current, 4, 1, "CHIA")) { + if (substring_equals(str, len, current, 4, "CHIA", NULL)) { char_array_append(primary, "K"); char_array_append(secondary, "K"); current += 2; continue; } - if (substring_equals(str, len, current, 2, 1, "CH")) { + if (substring_equals(str, len, current, 2, "CH", NULL)) { // "michael" if ((current > 0) - && substring_equals(str, len, current, 4, 1, "CHAE")) + && substring_equals(str, len, current, 4, "CHAE", NULL)) { char_array_append(primary, "K"); char_array_append(secondary, "X"); @@ -178,9 +176,9 @@ double_metaphone_codes_t *double_metaphone(char *input) { // Greek roots e.g. 
"chemistry", "chorus" if ((current == 0) - && (substring_equals(str, len, current + 1, 5, 3, "HARAC", "HARIS", "HOREO") - || substring_equals(str, len, current + 1, 4, 3, "HIRO", "HAOS", "HAOT") - || (substring_equals(str, len, current + 1, 3, 5, "HOR", "HYM", "HIA", "HEM", "HIM") && !substring_equals(str, len, current + 1, 5, 2, "HEMIN"))) + && (substring_equals(str, len, current + 1, 5, "HARAC", "HARIS", "HOREO", NULL) + || substring_equals(str, len, current + 1, 4, "HIRO", "HAOS", "HAOT", NULL) + || (substring_equals(str, len, current + 1, 3, "HOR", "HYM", "HIA", "HEM", "HIM", NULL) && !substring_equals(str, len, current + 1, 5, "HEMIN", NULL))) ) { char_array_append(primary, "K"); @@ -191,20 +189,20 @@ double_metaphone_codes_t *double_metaphone(char *input) { // Germanic, Greek, or otherwise "ch" for "kh" sound if ( - (substring_equals(str, len, 0, 4, 2, "VAN ", "VON ") - || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") - || substring_equals(str, len, 0, 3, 1, "SCH")) + (substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL) + || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) // "ochestra", "orchid", "architect" but not "arch" - || substring_equals(str, len, current - 2, 6, 3, "ORCHES", "ARCHIT", "ORCHID") - || substring_equals(str, len, current + 2, 1, 2, "T", "S") + || substring_equals(str, len, current - 2, 6, "ORCHES", "ARCHIT", "ORCHID", NULL) + || substring_equals(str, len, current + 2, 1, "T", "S", NULL) || ( - (((current == 0) || substring_equals(str, len, current - 1, 1, 4, "A", "O", "U", "E")) + (((current == 0) || substring_equals(str, len, current - 1, 1, "A", "O", "U", "E", NULL)) // e.g. not "breach", "broach", "pouch", "beech", etc. - && !substring_equals(str, len, current - 2, 2, 6, "EA", "OU", "EE", "OA", "OO", "AU") + && !substring_equals(str, len, current - 2, 2, "EA", "OU", "EE", "OA", "OO", "AU", NULL) // e.g. 
not "lunch", "birch", "gulch" - && !substring_equals(str, len, current - 1, 1, 3, "L", "R", "N")) + && !substring_equals(str, len, current - 1, 1, "L", "R", "N", NULL)) // e.g. "wachtler", "wechsler", but not "tichner" - && ((current + 1 == last) || substring_equals(str, len, current + 2, 1, 10, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ")) + && ((current + 1 == last) || substring_equals(str, len, current + 2, 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", NULL)) ) ) { @@ -212,7 +210,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "K"); } else { if (current > 0) { - if (substring_equals(str, len, 0, 2, 1, "MC")) { + if (substring_equals(str, len, 0, 2, "MC", NULL)) { char_array_append(primary, "K"); char_array_append(secondary, "K"); } else { @@ -229,8 +227,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // e.g, "czerny" - if (substring_equals(str, len, current, 2, 1, "CZ") - && !substring_equals(str, len, current - 2, 4, 1, "WICZ")) + if (substring_equals(str, len, current, 2, "CZ", NULL) + && !substring_equals(str, len, current - 2, 4, "WICZ", NULL)) { char_array_append(primary, "S"); char_array_append(secondary, "X"); @@ -239,23 +237,23 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // double 'C' but not if e.g. 
"McClellan" - if (substring_equals(str, len, current, 2, 1, "CC") + if (substring_equals(str, len, current, 2, "CC", NULL) && !((current == 1) && get_char_at(str, len, 0) == 'M')) { // "bellocchio" but not "bacchus" - if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "H") - && !substring_equals(str, len, current + 2, 3, 4, "HUS", "HUM", "HUN", "HAN")) + if (substring_equals(str, len, current + 2, 1, "I", "E", "H", NULL) + && !substring_equals(str, len, current + 2, 3, "HUS", "HUM", "HUN", "HAN", NULL)) { // "accident", "accede", "succeed" if (((current == 1) && (get_char_at(str, len, current - 1) == 'A')) - || substring_equals(str, len, current - 1, 5, 2, "UCCEE", "UCCES")) + || substring_equals(str, len, current - 1, 5, "UCCEE", "UCCES", NULL)) { char_array_append(primary, "KS"); char_array_append(secondary, "KS"); // "pinocchio" but not "riccio" or "picchu" } else if (get_char_at(str, len, current + 2) == 'H' - && !substring_equals(str, len, current + 2, 2, 2, "HU", "HA")) { + && !substring_equals(str, len, current + 2, 2, "HU", "HA", NULL)) { char_array_append(primary, "K"); char_array_append(secondary, "X"); } else { @@ -273,15 +271,15 @@ double_metaphone_codes_t *double_metaphone(char *input) { } } - if (substring_equals(str, len, current, 2, 3, "CK", "CG", "CQ")) { + if (substring_equals(str, len, current, 2, "CK", "CG", "CQ", NULL)) { char_array_append(primary, "K"); char_array_append(secondary, "K"); current += 2; continue; } - if (substring_equals(str, len, current, 2, 4, "CI", "CJ", "CE", "CY")) { - if (substring_equals(str, len, current, 3, 5, "CIO", "CIE", "CIA", "CIU")) { + if (substring_equals(str, len, current, 2, "CI", "CJ", "CE", "CY", NULL)) { + if (substring_equals(str, len, current, 3, "CIO", "CIE", "CIA", "CIU", NULL)) { char_array_append(primary, "S"); char_array_append(secondary, "X"); } else { @@ -296,10 +294,10 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "K"); 
char_array_append(secondary, "K"); - if (substring_equals(str, len, current + 1, 2, 3, " C", " Q", " G")) { + if (substring_equals(str, len, current + 1, 2, " C", " Q", " G", NULL)) { current += 3; - } else if (substring_equals(str, len, current + 1, 1, 3, "C", "K", "Q") - && !substring_equals(str, len, current + 1, 2, 2, "CE", "CI")) + } else if (substring_equals(str, len, current + 1, 1, "C", "K", "Q", NULL) + && !substring_equals(str, len, current + 1, 2, "CE", "CI", NULL)) { current += 2; } else { @@ -308,8 +306,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } else if (c == 'D') { - if (substring_equals(str, len, current, 2, 1, "DG")) { - if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "Y")) { + if (substring_equals(str, len, current, 2, "DG", NULL)) { + if (substring_equals(str, len, current + 2, 1, "I", "E", "Y", NULL)) { // e.g. "edge" char_array_append(primary, "J"); char_array_append(secondary, "J"); @@ -323,7 +321,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { } } - if (substring_equals(str, len, current, 2, 2, "DT", "DD")) { + if (substring_equals(str, len, current, 2, "DT", "DD", NULL)) { char_array_append(primary, "T"); char_array_append(secondary, "T"); current += 2; @@ -370,13 +368,13 @@ double_metaphone_codes_t *double_metaphone(char *input) { // Parker's rule (with some further refinements) - e.g. "hugh" if ( ((current > 1) - && substring_equals(str, len, current - 2, 1, 3, "B", "H", "D")) + && substring_equals(str, len, current - 2, 1, "B", "H", "D", NULL)) // e.g. "bough" || ((current > 2) - && substring_equals(str, len, current - 3, 1, 3, "B", "H", "D")) + && substring_equals(str, len, current - 3, 1, "B", "H", "D", NULL)) // e.g. "broughton" || ((current > 3) - && substring_equals(str, len, current - 4, 1, 2, "B", "H")) + && substring_equals(str, len, current - 4, 1, "B", "H", NULL)) ) { current += 2; @@ -385,7 +383,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { // e.g. 
"laugh", "McLaughlin", "cough", "gough", "rough", "tough" if ((current > 2) && (get_char_at(str, len, current - 1) == 'U') - && substring_equals(str, len, current - 3, 1, 5, "C", "G", "L", "R", "T")) + && substring_equals(str, len, current - 3, 1, "C", "G", "L", "R", "T", NULL)) { char_array_append(primary, "F"); char_array_append(secondary, "F"); @@ -408,7 +406,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "KN"); char_array_append(secondary, "N"); // not e.g. "cagney" - } else if (!substring_equals(str, len, current + 2, 2, 1, "EY") + } else if (!substring_equals(str, len, current + 2, 2, "EY", NULL) && (get_char_at(str, len, current + 1) != 'Y') && !slavo_germanic) { @@ -423,7 +421,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // "tagliaro" - if (substring_equals(str, len, current + 1, 2, 1, "LI") + if (substring_equals(str, len, current + 1, 2, "LI", NULL) && !slavo_germanic) { char_array_append(primary, "KL"); @@ -435,9 +433,9 @@ double_metaphone_codes_t *double_metaphone(char *input) { // -ges-, -gep-, -gel-, -gie- at beginning if ((current == 0) && ((get_char_at(str, len, current + 1) == 'Y') - || substring_equals(str, len, current + 1, 2, 11, "ES", "EP", + || substring_equals(str, len, current + 1, 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", - "EI", "ER"))) + "EI", "ER", NULL))) { char_array_append(primary, "K"); char_array_append(secondary, "J"); @@ -447,11 +445,11 @@ double_metaphone_codes_t *double_metaphone(char *input) { // -ger-, -gy- if ( - (substring_equals(str, len, current + 1, 2, 1, "ER") + (substring_equals(str, len, current + 1, 2, "ER", NULL) || (get_char_at(str, len, current + 1) == 'Y')) - && !substring_equals(str, len, 0, 6, 3, "DANGER", "RANGER", "MANGER") - && !substring_equals(str, len, current - 1, 1, 2, "E", "I") - && !substring_equals(str, len, current - 1, 3, 2, "RGY", "OGY") + && !substring_equals(str, len, 0, 6, "DANGER", "RANGER", "MANGER", NULL) + && 
!substring_equals(str, len, current - 1, 1, "E", "I", NULL) + && !substring_equals(str, len, current - 1, 3, "RGY", "OGY", NULL) ) { char_array_append(primary, "K"); @@ -461,22 +459,22 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // italian e.g. "viaggi" - if (substring_equals(str, len, current + 1, 1, 3, "E", "I", "Y") - || substring_equals(str, len, current - 1, 4, 2, "AGGI", "OGGI")) + if (substring_equals(str, len, current + 1, 1, "E", "I", "Y", NULL) + || substring_equals(str, len, current - 1, 4, "AGGI", "OGGI", NULL)) { // obvious germanic if ( - (substring_equals(str, len, 0, 4, 2, "VAN ", "VON ") - || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") - || substring_equals(str, len, 0, 3, 1, "SCH")) - || substring_equals(str, len, current + 1, 2, 1, "ET")) + (substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL) + || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) + || substring_equals(str, len, current + 1, 2, "ET", NULL)) { char_array_append(primary, "K"); char_array_append(secondary, "K"); } else { - if (substring_equals(str, len, current + 1, 4, 1, "IER ") - || ((current == len - 3) && substring_equals(str, len, current + 1, 3, 1, "IER"))) + if (substring_equals(str, len, current + 1, 4, "IER ", NULL) + || ((current == len - 3) && substring_equals(str, len, current + 1, 3, "IER", NULL))) { char_array_append(primary, "J"); char_array_append(secondary, "J"); @@ -513,13 +511,13 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } else if (c == 'J') { // obvious Spanish, "Jose", "San Jacinto" - if (substring_equals(str, len, current, 4, 1, "JOSE") - || substring_equals(str, len, current, 5, 1, "JOSÉ") - || substring_equals(str, len, 0, 4, 1, "SAN ")) + if (substring_equals(str, len, current, 4, "JOSE", NULL) + || substring_equals(str, len, current, 5, "JOSÉ", NULL) + || substring_equals(str, len, 0, 4, "SAN ", NULL)) { if (((current == 
0) && (get_char_at(str, len, current + 4) == ' ')) - || substring_equals(str, len, 0, 4, 1, "SAN ")) + || substring_equals(str, len, 0, 4, "SAN ", NULL)) { char_array_append(primary, "H"); char_array_append(secondary, "H"); @@ -533,8 +531,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { } if ((current == 0) - && !substring_equals(str, len, current, 4, 1, "JOSE") - && !substring_equals(str, len, current, 5, 1, "JOSÉ")) + && !substring_equals(str, len, current, 4, "JOSE", NULL) + && !substring_equals(str, len, current, 5, "JOSÉ", NULL)) { // Yankelovich/Jankelowicz char_array_append(primary, "J"); @@ -551,12 +549,12 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "J"); char_array_append(secondary, "H"); } else { - if (current == last || ((current == last - 1 || get_char_at(str, len, current + 2) == ' ') && isalpha(get_char_at(str, len, current - 1)) && substring_equals(str, len, current + 1, 1, 2, "A", "O"))) { + if (current == last || ((current == last - 1 || get_char_at(str, len, current + 2) == ' ') && isalpha(get_char_at(str, len, current - 1)) && substring_equals(str, len, current + 1, 1, "A", "O", NULL))) { char_array_append(primary, "J"); } else { - if (!substring_equals(str, len, current + 1, 1, 8, "L", "T", - "K", "S", "N", "M", "B", "Z") - && !substring_equals(str, len, current - 1, 1, 3, "S", "K", "L")) + if (!substring_equals(str, len, current + 1, 1, "L", "T", + "K", "S", "N", "M", "B", "Z", NULL) + && !substring_equals(str, len, current - 1, 1, "S", "K", "L", NULL)) { char_array_append(primary, "J"); char_array_append(secondary, "J"); @@ -586,10 +584,10 @@ double_metaphone_codes_t *double_metaphone(char *input) { if (get_char_at(str, len, current + 1) == 'L') { // Spanish e.g. 
"Cabrillo", "Gallegos" if (((current == (len - 3)) - && substring_equals(str, len, current - 1, 4, 3, "ILLO", "ILLA", "ALLE")) - || ((substring_equals(str, len, last - 1, 2, 2, "AS", "OS") - || substring_equals(str, len, last, 1, 2, "A", "O")) - && substring_equals(str, len, current - 1, 4, 1, "ALLE") + && substring_equals(str, len, current - 1, 4, "ILLO", "ILLA", "ALLE", NULL)) + || ((substring_equals(str, len, last - 1, 2, "AS", "OS", NULL) + || substring_equals(str, len, last, 1, "A", "O", NULL)) + && substring_equals(str, len, current - 1, 4, "ALLE", NULL) ) ) { @@ -606,9 +604,9 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "L"); continue; } else if (c == 'M') { - if ((substring_equals(str, len, current - 1, 3, 1, "UMB") + if ((substring_equals(str, len, current - 1, 3, "UMB", NULL) && (((current + 1) == last) - || substring_equals(str, len, current + 2, 2, 1, "ER"))) + || substring_equals(str, len, current + 2, 2, "ER", NULL))) || (get_char_at(str, len, current + 1) == 'M')) { current += 2; @@ -619,7 +617,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "M"); continue; // Ñ (NFD normalized) - } else if (substring_equals(str, len, current, 3, 1, "N\xcc\x83")) { + } else if (substring_equals(str, len, current, 3, "N\xcc\x83", NULL)) { current += 3; char_array_append(primary, "N"); char_array_append(secondary, "N"); @@ -635,7 +633,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "N"); continue; } else if (c == 'P') { - if (get_char_at(str, len, current + 1) == 'H') { + if (substring_equals(str, len, current + 1, 1, "H", "F", NULL)) { char_array_append(primary, "F"); char_array_append(secondary, "F"); current += 2; @@ -643,7 +641,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // also account for "Campbell", "raspberry" - if (substring_equals(str, len, current + 1, 1, 2, "P", "B")) { + if (substring_equals(str, len, 
current + 1, 1, "P", "B", NULL)) { current += 2; } else { current++; @@ -666,8 +664,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { // french e.g. "rogier", but exclude "hochmeier" if ((current == last) && !slavo_germanic - && substring_equals(str, len, current - 2, 2, 1, "IE") - && !substring_equals(str, len, current - 4, 2, 2, "ME", "MA")) + && substring_equals(str, len, current - 2, 2, "IE", NULL) + && !substring_equals(str, len, current - 4, 2, "ME", "MA", NULL)) { char_array_append(secondary, "R"); } else { @@ -683,14 +681,14 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } else if (c == 'S') { // special cases "island", "isle", "carlisle", "carlysle" - if (substring_equals(str, len, current - 1, 3, 2, "ISL", "YSL")) { + if (substring_equals(str, len, current - 1, 3, "ISL", "YSL", NULL)) { current++; continue; } // special case "sugar-" if ((current == 0) - && substring_equals(str, len, current, 5, 1, "SUGAR")) + && substring_equals(str, len, current, 5, "SUGAR", NULL)) { char_array_append(primary, "X"); char_array_append(secondary, "S"); @@ -698,9 +696,9 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - if (substring_equals(str, len, current, 2, 1, "SH")) { + if (substring_equals(str, len, current, 2, "SH", NULL)) { // Germanic - if (substring_equals(str, len, current + 1, 4, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { + if (substring_equals(str, len, current + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ", NULL)) { char_array_append(primary, "S"); char_array_append(secondary, "S"); } else { @@ -712,8 +710,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // Italian & Armenian - if (substring_equals(str, len, current, 3, 2, "SIO", "SIA") - || substring_equals(str, len, current, 4, 1, "SIAN")) + if (substring_equals(str, len, current, 3, "SIO", "SIA", NULL) + || substring_equals(str, len, current, 4, "SIAN", NULL)) { if (!slavo_germanic) { char_array_append(primary, "S"); @@ -729,12 +727,12 @@ 
double_metaphone_codes_t *double_metaphone(char *input) { /* German & Anglicisations, e.g. "Smith" match "Schmidt", "Snider" match "Schneider" also, -sz- in Slavic language although in Hungarian it is pronounced 's' */ if (((current == 0) - && substring_equals(str, len, current + 1, 1, 4, "M", "N", "L", "W")) - || substring_equals(str, len, current + 1, 1, 1, "Z")) + && substring_equals(str, len, current + 1, 1, "M", "N", "L", "W", NULL)) + || substring_equals(str, len, current + 1, 1, "Z", NULL)) { char_array_append(primary, "S"); char_array_append(secondary, "X"); - if (substring_equals(str, len, current + 1, 1, 1, "Z")) { + if (substring_equals(str, len, current + 1, 1, "Z", NULL)) { current += 2; } else { current++; @@ -743,15 +741,15 @@ double_metaphone_codes_t *double_metaphone(char *input) { } - if (substring_equals(str, len, current, 2, 1, "SC")) { + if (substring_equals(str, len, current, 2, "SC", NULL)) { // Schlesinger's rule if (get_char_at(str, len, current + 2) == 'H') { // Dutch origin e.g. "school", "schooner" - if (substring_equals(str, len, current + 3, 2, 6, "OO", "ER", "EN", - "UY", "ED", "EM")) + if (substring_equals(str, len, current + 3, 2, "OO", "ER", "EN", + "UY", "ED", "EM", NULL)) { // "Schermerhorn", "Schenker" - if (substring_equals(str, len, current + 3, 2, 2, "ER", "EN")) { + if (substring_equals(str, len, current + 3, 2, "ER", "EN", NULL)) { char_array_append(primary, "X"); char_array_append(secondary, "SK"); } else { @@ -774,7 +772,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "Y")) { + if (substring_equals(str, len, current + 2, 1, "I", "E", "Y", NULL)) { char_array_append(primary, "S"); char_array_append(secondary, "S"); current += 3; @@ -790,7 +788,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { // French e.g. 
"resnais", "artois" if ((current == last) - && substring_equals(str, len, current - 2, 2, 2, "AI", "OI")) + && substring_equals(str, len, current - 2, 2, "AI", "OI", NULL)) { char_array_append(secondary, "S"); } else { @@ -798,7 +796,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "S"); } - if (substring_equals(str, len, current + 1, 1, 2, "S", "Z")) { + if (substring_equals(str, len, current + 1, 1, "S", "Z", NULL)) { current += 2; } else { @@ -807,28 +805,28 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } else if (c == 'T') { - if (substring_equals(str, len, current, 4, 1, "TION")) { + if (substring_equals(str, len, current, 4, "TION", NULL)) { char_array_append(primary, "X"); char_array_append(secondary, "X"); current += 3; continue; } - if (substring_equals(str, len, current, 3, 2, "TIA", "TCH")) { + if (substring_equals(str, len, current, 3, "TIA", "TCH", NULL)) { char_array_append(primary, "X"); char_array_append(secondary, "X"); current += 3; continue; } - if (substring_equals(str, len, current, 2, 1, "TH") - || substring_equals(str, len, current, 3, 1, "TTH")) + if (substring_equals(str, len, current, 2, "TH", NULL) + || substring_equals(str, len, current, 3, "TTH", NULL)) { // special case "Thomas", "Thames", or Germanic - if (substring_equals(str, len, current + 2, 2, 2, "OM", "AM") - || substring_equals(str, len, 0, 4, 2, "VAN ", "VON ") - || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") - || substring_equals(str, len, 0, 3, 1, "SCH")) + if (substring_equals(str, len, current + 2, 2, "OM", "AM", NULL) + || substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL) + || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) { char_array_append(primary, "T"); char_array_append(secondary, "T"); @@ -842,7 +840,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - if (substring_equals(str, 
len, current + 1, 1, 2, "T", "D")) { + if (substring_equals(str, len, current + 1, 1, "T", "D", NULL)) { current += 2; } else { current++; @@ -863,7 +861,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } else if (c == 'W') { // can also be in the middle of word - if (substring_equals(str, len, current, 2, 1, "WR")) { + if (substring_equals(str, len, current, 2, "WR", NULL)) { char_array_append(primary, "R"); char_array_append(secondary, "R"); current += 2; @@ -872,7 +870,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { if ((current == 0) && (is_vowel(get_char_at(str, len, current + 1)) - || substring_equals(str, len, current, 2, 1, "WH"))) + || substring_equals(str, len, current, 2, "WH", NULL))) { // Wasserman should match Vasserman if (is_vowel(get_char_at(str, len, current + 1))) { @@ -887,9 +885,9 @@ double_metaphone_codes_t *double_metaphone(char *input) { // Arnow should match Arnoff if (((current == last) && is_vowel(get_char_at(str, len, current - 1))) - || substring_equals(str, len, current - 1, 5, 4, "EWSKI", "EWSKY", - "OWSKI", "OWSKY") - || substring_equals(str, len, 0, 3, 1, "SCH")) + || substring_equals(str, len, current - 1, 5, "EWSKI", "EWSKY", + "OWSKI", "OWSKY", NULL) + || substring_equals(str, len, 0, 3, "SCH", NULL)) { char_array_append(secondary, "F"); current++; @@ -897,7 +895,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { } // Polish e.g. "Filipowicz" - if (substring_equals(str, len, current, 4, 2, "WICZ", "WITZ")) { + if (substring_equals(str, len, current, 4, "WICZ", "WITZ", NULL)) { char_array_append(primary, "TS"); char_array_append(secondary, "FX"); current += 4; @@ -910,14 +908,14 @@ double_metaphone_codes_t *double_metaphone(char *input) { } else if (c == 'X') { // French e.g. 
"breaux" if (!((current == last) - && (substring_equals(str, len, current - 3, 3, 2, "IAU", "EAU") - || substring_equals(str, len, current - 2, 2, 2, "AU", "OU")))) + && (substring_equals(str, len, current - 3, 3, "IAU", "EAU", NULL) + || substring_equals(str, len, current - 2, 2, "AU", "OU", NULL)))) { char_array_append(primary, "KS"); char_array_append(secondary, "KS"); } - if (substring_equals(str, len, current + 1, 1, 2, "C", "X")) { + if (substring_equals(str, len, current + 1, 1, "C", "X", NULL)) { current += 2; } else { current++; @@ -930,7 +928,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "J"); current += 2; continue; - } else if (substring_equals(str, len, current + 1, 2, 3, "ZO", "ZI", "ZA") + } else if (substring_equals(str, len, current + 1, 2, "ZO", "ZI", "ZA", NULL) || (slavo_germanic && ((current > 0) && get_char_at(str, len, current - 1) != 'T'))) From e38e57b8e8b30610d4cf0d5439743798f394a2c3 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 27 Oct 2017 04:04:06 -0400 Subject: [PATCH 17/89] [numex] fixing edge case where something like "IV Michael" could cause a partial Roman numeral to get added for the MI portion of "Michael" --- src/numex.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/numex.c b/src/numex.c index 908fd8fe..7629bc2a 100644 --- a/src/numex.c +++ b/src/numex.c @@ -849,6 +849,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) { if (!whole_tokens_only || complete_token) { result.len = prev_result_len; number_finished = true; + complete_token = false; advance_index = false; state = start_state; rule = prev_rule = NUMEX_NULL_RULE; From 6d430f7e9ba230bf409b5cddfb966a62c1101ef0 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 27 Oct 2017 04:07:28 -0400 Subject: [PATCH 18/89] [utils] adding functions for finding the next index of a full stop/period charater in a string --- src/string_utils.c | 39 +++++++++++++++++++++++++++++++++++++++ src/string_utils.h | 9 +++++++++ 2 
files changed, 48 insertions(+) diff --git a/src/string_utils.c b/src/string_utils.c index f1155001..6b1b14ab 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -668,6 +668,45 @@ inline bool string_contains_hyphen(char *str) { return string_next_hyphen_index(str, strlen(str)) >= 0; } +ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len) { + uint8_t *ptr = (uint8_t *)str; + int32_t ch; + ssize_t idx = 0; + + while (idx < len) { + ssize_t char_len = utf8proc_iterate(ptr, len, &ch); + + if (char_len <= 0 || ch == 0) break; + + if ((uint32_t)ch == codepoint) return idx; + ptr += char_len; + idx += char_len; + } + return -1; +} + +ssize_t string_next_codepoint(char *str, uint32_t codepoint) { + return string_next_codepoint_len(str, codepoint, strlen(str)); +} + +#define PERIOD_CODEPOINT 46 + +ssize_t string_next_period_len(char *str, size_t len) { + return string_next_codepoint_len(str, PERIOD_CODEPOINT, len); +} + +ssize_t string_next_period(char *str) { + return string_next_codepoint(str, PERIOD_CODEPOINT); +} + +inline bool string_contains_period_len(char *str, size_t len) { + return string_next_codepoint_len(str, PERIOD_CODEPOINT, len) >= 0; +} + +inline bool string_contains_period(char *str) { + return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0; +} + size_t string_right_spaces_len(char *str, size_t len) { size_t spaces = 0; diff --git a/src/string_utils.h b/src/string_utils.h index 5ae041e2..29683f91 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -106,6 +106,15 @@ ssize_t string_next_hyphen_index(char *str, size_t len); bool string_contains_hyphen(char *str); bool string_contains_hyphen_len(char *str, size_t len); +ssize_t string_next_codepoint_len(char *str, uint32_t codepoint, size_t len); +ssize_t string_next_codepoint(char *str, uint32_t codepoint); + +ssize_t string_next_period_len(char *str, size_t len); +ssize_t string_next_period(char *str); + +bool 
string_contains_period_len(char *str, size_t len); +bool string_contains_period(char *str); + char *string_trim(char *str); /* char_array is a dynamic character array defined in collections.h From 053dca82ba241547fad4c2b81bfb6bab444a8fd2 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 28 Oct 2017 02:38:15 -0400 Subject: [PATCH 19/89] [expand] adding a normalization for a single non-acronym internal period where there's an expansion at the prefix/suffix (for #218 and https://github.com/openvenues/libpostal/issues/216#issuecomment-306617824). Helps in cases like "St.Michaels" or "Jln.Utara" without needing to specify concatenated prefix phrases for every possibility --- .../id/concatenated_prefixes_separable.txt | 2 - src/libpostal.c | 660 ++++++++++-------- src/libpostal.h | 11 +- src/normalize.c | 12 +- src/normalize.h | 2 + 5 files changed, 402 insertions(+), 285 deletions(-) delete mode 100644 resources/dictionaries/id/concatenated_prefixes_separable.txt diff --git a/resources/dictionaries/id/concatenated_prefixes_separable.txt b/resources/dictionaries/id/concatenated_prefixes_separable.txt deleted file mode 100644 index 3f4d6c59..00000000 --- a/resources/dictionaries/id/concatenated_prefixes_separable.txt +++ /dev/null @@ -1,2 +0,0 @@ -jl. -jln. 
diff --git a/src/libpostal.c b/src/libpostal.c index aca879f4..9209de11 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -85,6 +85,29 @@ static inline uint64_t get_normalize_string_options(libpostal_normalize_options_ return normalize_string_options; } + +static inline size_t string_hyphen_prefix_len(char *str, size_t len) { + // Strip beginning hyphens + int32_t unichr; + uint8_t *ptr = (uint8_t *)str; + ssize_t char_len = utf8proc_iterate(ptr, len, &unichr); + if (utf8_is_hyphen(unichr)) { + return (size_t)char_len; + } + return 0; +} + +static inline size_t string_hyphen_suffix_len(char *str, size_t len) { + // Strip beginning hyphens + int32_t unichr; + uint8_t *ptr = (uint8_t *)str; + ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr); + if (utf8_is_hyphen(unichr)) { + return (size_t)char_len; + } + return 0; +} + static void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { uint64_t normalize_token_options = get_normalize_token_options(options); @@ -97,6 +120,17 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type); normalize_token(strings, str, token, normalize_token_options); } else if (is_word_token(token.type)) { + + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + token.offset, token.len); + if (prefix_hyphen_len > 0) { + token.offset += prefix_hyphen_len; + } + + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + token.offset, token.len); + if (suffix_hyphen_len > 0) { + token.len -= suffix_hyphen_len; + } + normalize_token(strings, str, token, normalize_token_options); if (options.replace_word_hyphens) { @@ -114,10 +148,17 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke } else if (is_numeric_token(token.type)) { normalize_token(strings, str, token, normalize_token_options); - if 
(options.replace_numeric_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; + if (options.replace_word_hyphens || options.replace_numeric_hyphens) { + if (options.replace_word_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; + } + + if (options.replace_numeric_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; + } + normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS; + normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; } if (options.delete_numeric_hyphens) { @@ -126,18 +167,352 @@ static void add_normalized_strings_token(cstring_array *strings, char *str, toke normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; } } - + if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) { normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; normalize_token(strings, str, token, normalize_token_options); normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; } - } else { cstring_array_add_string(strings, " "); } } +static void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { + cstring_array_add_string(strings, str); + + if (options.roman_numerals) { + char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE); + if (numex_replaced != NULL) { + cstring_array_add_string(strings, numex_replaced); + free(numex_replaced); + } + + } + +} + + + +static address_expansion_array *get_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + if (value != NULL && value->components & options.address_components) { + return value->expansions; + } + + return NULL; +} + +static inline void 
cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { + if (expansion.canonical_index != NULL_CANONICAL_INDEX) { + char *canonical = address_dictionary_get_canonical(expansion.canonical_index); + uint64_t normalize_string_options = get_normalize_string_options(options); + char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); + canonical = canonical_normalized != NULL ? canonical_normalized : canonical; + + char_array_cat(key, canonical); + if (canonical_normalized != NULL) { + free(canonical_normalized); + } + } else { + char_array_cat_len(key, str + token.offset + phrase.start, phrase.len); + } +} + + +static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) { + cstring_array *strings = tree->strings; + + size_t skip_period = with_period ? 
1 : 0; + + bool have_suffix = suffix.len > 0 && suffix.len < token.len; + bool have_prefix = prefix.len > 0 && prefix.len + with_period < token.len; + + if (!have_suffix && !have_prefix) { + return false; + } + + address_expansion_array *prefix_expansions = NULL; + address_expansion_array *suffix_expansions = NULL; + + address_expansion_t prefix_expansion; + address_expansion_t suffix_expansion; + + char *expansion; + + size_t num_strings = 0; + char *root_word = NULL; + size_t root_len; + token_t root_token; + cstring_array *root_strings = NULL; + int add_space = 0; + int spaces = 0; + + size_t prefix_start, prefix_end, root_end, suffix_start; + + if (have_prefix) { + prefix_expansions = get_affix_expansions(prefix, options); + if (prefix_expansions == NULL) have_prefix = false; + } + + if (have_suffix) { + suffix_expansions = get_affix_expansions(suffix, options); + if (suffix_expansions == NULL) have_suffix = false; + } + + if (!have_suffix && !have_prefix) { + return false; + } + + char_array *key = char_array_new_size(token.len); + + if (have_prefix && have_suffix) { + for (size_t i = 0; i < prefix_expansions->n; i++) { + prefix_expansion = prefix_expansions->a[i]; + char_array_clear(key); + + cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + prefix_start = key->n - 1; + + add_space = (int)prefix_expansion.separable || with_period; + if (prefix.len + skip_period + suffix.len < token.len && !prefix_expansion.separable) { + add_space = suffix_expansion.separable || with_period; + } + + for (spaces = skip_period; spaces <= add_space; spaces++) { + key->n = prefix_start; + if (spaces) { + char_array_cat(key, " "); + } + + prefix_end = key->n; + + if (prefix.len + skip_period + suffix.len < token.len) { + root_len = token.len - suffix.len - prefix.len - skip_period; + size_t root_start = token.offset + prefix.len + skip_period; + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); + root_start += 
prefix_hyphen_len; + root_len -= prefix_hyphen_len; + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); + root_len -= suffix_hyphen_len; + root_token = (token_t){root_start, root_len, token.type}; + root_strings = cstring_array_new_size(root_len); + add_normalized_strings_token(root_strings, str, root_token, options); + num_strings = cstring_array_num_strings(root_strings); + + for (size_t j = 0; j < num_strings; j++) { + key->n = prefix_end; + root_word = cstring_array_get_string(root_strings, j); + char_array_cat(key, root_word); + root_end = key->n - 1; + + for (size_t k = 0; k < suffix_expansions->n; k++) { + key->n = root_end; + suffix_expansion = suffix_expansions->a[k]; + + int add_suffix_space = suffix_expansion.separable; + + suffix_start = key->n; + for (int suffix_spaces = skip_period; suffix_spaces <= add_suffix_space; suffix_spaces++) { + key->n = suffix_start; + if (suffix_spaces) { + char_array_cat(key, " "); + } + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(strings, expansion); + + } + + + } + } + + cstring_array_destroy(root_strings); + root_strings = NULL; + + } else { + for (size_t j = 0; j < suffix_expansions->n; j++) { + key->n = prefix_end - skip_period; + suffix_expansion = suffix_expansions->a[j]; + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + } + } + + } + } else if (have_suffix) { + log_debug("suffix.start=%" PRId32 "\n", suffix.start); + root_len = suffix.start; + root_token = (token_t){token.offset, root_len, token.type}; + log_debug("root_len=%zu\n", root_len); + log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type); + + root_strings = cstring_array_new_size(root_len + 1); + add_normalized_strings_token(root_strings, str, root_token, 
options); + num_strings = cstring_array_num_strings(root_strings); + + log_debug("num_strings = %zu\n", num_strings); + + for (size_t j = 0; j < num_strings; j++) { + char_array_clear(key); + root_word = cstring_array_get_string(root_strings, j); + log_debug("root_word=%s\n", root_word); + char_array_cat(key, root_word); + root_end = key->n - 1; + + for (size_t k = 0; k < suffix_expansions->n; k++) { + key->n = root_end; + suffix_expansion = suffix_expansions->a[k]; + + add_space = (suffix_expansion.separable || with_period) && suffix.len < token.len; + suffix_start = key->n; + + for (int spaces = skip_period; spaces <= add_space; spaces++) { + key->n = suffix_start; + if (spaces) { + char_array_cat(key, " "); + } + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + } + } + } else if (have_prefix) { + if (prefix.len + skip_period <= token.len) { + root_len = token.len - prefix.len - skip_period; + size_t root_start = token.offset + prefix.len + skip_period; + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); + root_start += prefix_hyphen_len; + root_len -= prefix_hyphen_len; + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); + root_len -= suffix_hyphen_len; + root_token = (token_t){root_start, root_len, token.type}; + root_strings = cstring_array_new_size(root_len); + add_normalized_strings_token(root_strings, str, root_token, options); + num_strings = cstring_array_num_strings(root_strings); + + } else { + root_strings = cstring_array_new_size(token.len); + add_normalized_strings_token(root_strings, str, token, options); + num_strings = cstring_array_num_strings(root_strings); + + for (size_t k = 0; k < num_strings; k++) { + root_word = cstring_array_get_string(root_strings, k); + cstring_array_add_string(tree->strings, root_word); + } + + char_array_destroy(key); + 
cstring_array_destroy(root_strings); + return false; + + } + + for (size_t j = 0; j < prefix_expansions->n; j++) { + char_array_clear(key); + prefix_expansion = prefix_expansions->a[j]; + + cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + prefix_end = key->n - 1; + + add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len; + for (int spaces = skip_period; spaces <= add_space; spaces++) { + key->n = prefix_end; + if (spaces) { + char_array_cat(key, " "); + } + size_t prefix_space_len = key->n - spaces; + for (size_t k = 0; k < num_strings; k++) { + key->n = prefix_space_len; + root_word = cstring_array_get_string(root_strings, k); + char_array_cat(key, root_word); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + + } + } + } + + char_array_destroy(key); + + if (root_strings != NULL) { + cstring_array_destroy(root_strings); + } + + return true; + +} + +static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); + + phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); + + if ((suffix.len == 0 && prefix.len == 0)) return false; + + bool with_period = false; + + return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); +} + +static inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + ssize_t first_period_index = string_next_period_len(str + token.offset, token.len); + if (first_period_index > 0) { + ssize_t next_period_index = string_next_period_len(str + token.offset + first_period_index + 1, token.len - first_period_index - 1); + // Token contains only one period or one + a final period + if (next_period_index < 0 || next_period_index 
== token.len - 1) { + phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang); + + phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang); + if (suffix.len > 0) { + suffix.start = first_period_index + 1; + } + + if (suffix.len == 0 && prefix.len == 0) return false; + + bool with_period = true; + + return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); + } else { + return false; + } + } else { + return false; + } +} + +static bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) { + bool have_period_affixes = false; + if (string_contains_period_len(str + token.offset, token.len)) { + for (size_t l = 0; l < options.num_languages; l++) { + char *lang = options.languages[l]; + if (expand_affixes_period(tree, str, lang, token, options)) { + have_period_affixes = true; + break; + } + } + } + + if (!have_period_affixes) { + string_tree_add_string_len(tree, str + token.offset, token.len); + } + + return have_period_affixes; +} + + static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) { char_array *key = NULL; @@ -252,7 +627,7 @@ static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_opt } log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); - string_tree_add_string_len(tree, str + token.offset, token.len); + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); last_added_was_whitespace = false; } else if (!last_added_was_whitespace) { log_debug("Adding pre-phrase whitespace\n"); @@ -444,7 +819,7 @@ static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_opt } log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); - string_tree_add_string_len(tree, str + token.offset, token.len); 
+ bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); last_added_was_whitespace = false; } else if (!last_added_was_whitespace) { log_debug("Adding space IV\n"); @@ -479,7 +854,7 @@ static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_opt string_tree_finalize_token(tree); } - string_tree_add_string_len(tree, str + token.offset, token.len); + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); last_added_was_whitespace = false; } else if (!last_added_was_whitespace) { log_debug("Adding space VI\n"); @@ -503,275 +878,6 @@ static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_opt return tree; } -static void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { - cstring_array_add_string(strings, str); - - if (options.roman_numerals) { - char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE); - if (numex_replaced != NULL) { - cstring_array_add_string(strings, numex_replaced); - free(numex_replaced); - } - - } - -} - - - -static address_expansion_array *get_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { - uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - if (value != NULL && value->components & options.address_components) { - return value->expansions; - } - - return NULL; -} - -static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { - if (expansion.canonical_index != NULL_CANONICAL_INDEX) { - char *canonical = address_dictionary_get_canonical(expansion.canonical_index); - uint64_t normalize_string_options = get_normalize_string_options(options); - char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); - canonical = 
canonical_normalized != NULL ? canonical_normalized : canonical; - - char_array_cat(key, canonical); - if (canonical_normalized != NULL) { - free(canonical_normalized); - } - } else { - char_array_cat_len(key, str + token.offset + phrase.start, phrase.len); - } -} - -static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options) { - cstring_array *strings = tree->strings; - - bool have_suffix = suffix.len > 0 && suffix.len < token.len; - bool have_prefix = prefix.len > 0 && prefix.len < token.len; - - if (!have_suffix && !have_prefix) { - return false; - } - - address_expansion_array *prefix_expansions = NULL; - address_expansion_array *suffix_expansions = NULL; - - address_expansion_t prefix_expansion; - address_expansion_t suffix_expansion; - - char *expansion; - - size_t num_strings = 0; - char *root_word = NULL; - size_t root_len; - token_t root_token; - cstring_array *root_strings = NULL; - int add_space = 0; - int spaces = 0; - - size_t prefix_start, prefix_end, root_end, suffix_start; - - if (have_prefix) { - prefix_expansions = get_affix_expansions(prefix, options); - if (prefix_expansions == NULL) have_prefix = false; - } - - if (have_suffix) { - suffix_expansions = get_affix_expansions(suffix, options); - if (suffix_expansions == NULL) have_suffix = false; - } - - if (!have_suffix && !have_prefix) { - return false; - } - - char_array *key = char_array_new_size(token.len); - - if (have_prefix && have_suffix) { - for (size_t i = 0; i < prefix_expansions->n; i++) { - prefix_expansion = prefix_expansions->a[i]; - char_array_clear(key); - - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); - prefix_start = key->n - 1; - - add_space = (int)prefix_expansion.separable; - if (prefix.len + suffix.len < token.len && !prefix_expansion.separable) { - add_space = suffix_expansion.separable; - } - - for (spaces = 0; spaces <= add_space; spaces++) 
{ - key->n = prefix_start; - if (spaces) { - char_array_cat(key, " "); - } - - prefix_end = key->n; - - if (prefix.len + suffix.len < token.len) { - root_len = token.len - suffix.len - prefix.len; - root_token = (token_t){token.offset + prefix.len, root_len, token.type}; - root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - for (size_t j = 0; j < num_strings; j++) { - key->n = prefix_end; - root_word = cstring_array_get_string(root_strings, j); - char_array_cat(key, root_word); - root_end = key->n - 1; - - for (size_t k = 0; k < suffix_expansions->n; k++) { - key->n = root_end; - suffix_expansion = suffix_expansions->a[k]; - - int add_suffix_space = suffix_expansion.separable; - - suffix_start = key->n; - for (int suffix_spaces = 0; suffix_spaces <= add_suffix_space; suffix_spaces++) { - key->n = suffix_start; - if (suffix_spaces) { - char_array_cat(key, " "); - } - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(strings, expansion); - - } - - - } - } - - cstring_array_destroy(root_strings); - root_strings = NULL; - - } else { - for (size_t j = 0; j < suffix_expansions->n; j++) { - key->n = prefix_end; - suffix_expansion = suffix_expansions->a[j]; - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - } - } - - } - } else if (have_suffix) { - log_debug("suffix.start=%" PRId32 "\n", suffix.start); - root_len = suffix.start; - root_token = (token_t){token.offset, root_len, token.type}; - log_debug("root_len=%zu\n", root_len); - log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type); - - root_strings = cstring_array_new_size(root_len + 1); - 
add_normalized_strings_token(root_strings, str, root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - log_debug("num_strings = %zu\n", num_strings); - - for (size_t j = 0; j < num_strings; j++) { - char_array_clear(key); - root_word = cstring_array_get_string(root_strings, j); - log_debug("root_word=%s\n", root_word); - char_array_cat(key, root_word); - root_end = key->n - 1; - - for (size_t k = 0; k < suffix_expansions->n; k++) { - key->n = root_end; - suffix_expansion = suffix_expansions->a[k]; - - add_space = suffix_expansion.separable && suffix.len < token.len; - suffix_start = key->n; - - for (int spaces = 0; spaces <= add_space; spaces++) { - key->n = suffix_start; - if (spaces) { - char_array_cat(key, " "); - } - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - } - } - } else if (have_prefix) { - if (prefix.len <= token.len) { - root_len = token.len - prefix.len; - root_token = (token_t){token.offset + prefix.len, root_len, token.type}; - root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - } else { - root_strings = cstring_array_new_size(token.len); - add_normalized_strings_token(root_strings, str, token, options); - num_strings = cstring_array_num_strings(root_strings); - - for (size_t k = 0; k < num_strings; k++) { - root_word = cstring_array_get_string(root_strings, k); - cstring_array_add_string(tree->strings, root_word); - } - - char_array_destroy(key); - cstring_array_destroy(root_strings); - return false; - - } - - for (size_t j = 0; j < prefix_expansions->n; j++) { - char_array_clear(key); - prefix_expansion = prefix_expansions->a[j]; - - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); - prefix_end = key->n - 1; - - add_space = 
prefix_expansion.separable && prefix.len < token.len; - for (int spaces = 0; spaces <= add_space; spaces++) { - key->n = prefix_end; - if (spaces) { - char_array_cat(key, " "); - } - for (size_t k = 0; k < num_strings; k++) { - root_word = cstring_array_get_string(root_strings, k); - char_array_cat(key, root_word); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - - } - } - } - - char_array_destroy(key); - - if (root_strings != NULL) { - cstring_array_destroy(root_strings); - } - - return true; - -} - -static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { - phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); - - phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); - - if ((suffix.len == 0 && prefix.len == 0)) return false; - - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options); -} static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); diff --git a/src/libpostal.h b/src/libpostal.h index 274c6391..2c651817 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -160,6 +160,12 @@ bool libpostal_setup_parser(void); bool libpostal_setup_parser_datadir(char *datadir); void libpostal_teardown_parser(void); +bool libpostal_setup_language_classifier(void); +bool libpostal_setup_language_classifier_datadir(char *datadir); +void libpostal_teardown_language_classifier(void); + +/* Tokenization and token normalization APIs */ + typedef struct libpostal_token { size_t offset; size_t len; @@ -190,6 +196,7 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n); #define LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC 1 
<< 6 #define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS 1 << 7 #define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS 1 << 8 +#define LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS 1 << 9 #define LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS (LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII | LIBPOSTAL_NORMALIZE_STRING_COMPOSE | LIBPOSTAL_NORMALIZE_STRING_TRIM | LIBPOSTAL_NORMALIZE_STRING_REPLACE_HYPHENS | LIBPOSTAL_NORMALIZE_STRING_STRIP_ACCENTS | LIBPOSTAL_NORMALIZE_STRING_LOWERCASE) @@ -209,10 +216,6 @@ typedef struct libpostal_normalized_token { libpostal_normalized_token_t *libpostal_normalized_tokens(char *input, uint64_t string_options, uint64_t token_options, bool whitespace, size_t *n); -bool libpostal_setup_language_classifier(void); -bool libpostal_setup_language_classifier_datadir(char *datadir); -void libpostal_teardown_language_classifier(void); - #ifdef __cplusplus } #endif diff --git a/src/normalize.c b/src/normalize.c index 076b6e56..aa9f2ef1 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -400,9 +400,12 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t char *append_if_not_numeric = NULL; int32_t ch; + int32_t next_ch; ssize_t char_len; + ssize_t next_char_len; bool last_was_letter = false; + bool last_was_number = false; bool append_char = true; while (idx < len) { @@ -416,9 +419,14 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t bool is_letter = utf8_is_letter(cat); bool is_number = utf8_is_number(cat); + next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch); + int next_cat = utf8proc_category(next_ch); + bool next_is_number = utf8_is_number(next_cat); + + bool is_full_stop = ch == FULL_STOP_CODEPOINT; - if (is_hyphen && last_was_letter && options & NORMALIZE_TOKEN_REPLACE_HYPHENS) { + if (is_hyphen && options & NORMALIZE_TOKEN_REPLACE_HYPHENS && (!(last_was_number && next_is_number) || options & NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS)) { char_array_append(array, " 
"); append_char = false; } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) { @@ -481,7 +489,7 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t append_char = true; last_was_letter = is_letter; - + last_was_number = is_number; } } diff --git a/src/normalize.h b/src/normalize.h index 755b7cee..9d58f78b 100644 --- a/src/normalize.h +++ b/src/normalize.h @@ -35,6 +35,7 @@ As well as normalizations for individual string tokens: #include "utf8proc/utf8proc.h" #include "unicode_scripts.h" #include "numex.h" +#include "scanner.h" #include "transliterate.h" #include "trie.h" #include "tokens.h" @@ -60,6 +61,7 @@ As well as normalizations for individual string tokens: #define NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC #define NORMALIZE_TOKEN_REPLACE_DIGITS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_DIGITS #define NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_TOKEN_LETTERS +#define NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS LIBPOSTAL_NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS // Replace digits with capital D e.g. 10013 => DDDDD, intended for use with lowercased strings #define DIGIT_CHAR "D" From 2d6079b06f3a5be427f108ddf3dd42321774c790 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 28 Oct 2017 02:40:14 -0400 Subject: [PATCH 20/89] [expand] added search_address_dictionaries_substring to support the new use case (i.e. returns "is this substring in the trie?" 
regardless of if it's stored under the special prefixes/suffixes namespaces) --- src/address_dictionary.c | 25 +++++++++++++++++++++++++ src/address_dictionary.h | 1 + 2 files changed, 26 insertions(+) diff --git a/src/address_dictionary.c b/src/address_dictionary.c index ed1116f6..fd3fe471 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -251,6 +251,31 @@ phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, return phrases; } + +phrase_t search_address_dictionaries_substring(char *str, size_t len, char *lang) { + if (str == NULL) return NULL_PHRASE; + if (address_dict == NULL) { + log_error(ADDRESS_DICTIONARY_SETUP_ERROR); + return NULL_PHRASE; + } + + trie_prefix_result_t prefix = get_language_prefix(lang); + + if (prefix.node_id == NULL_NODE_ID) { + log_debug("prefix.node_id == NULL_NODE_ID\n"); + return NULL_PHRASE; + } + + phrase_t phrase = trie_search_prefixes_from_index(address_dict->trie, str, len, prefix.node_id); + if (phrase.len == len) { + return phrase; + } else { + return NULL_PHRASE; + } + +} + + phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang) { if (str == NULL) return NULL_PHRASE; if (address_dict == NULL) { diff --git a/src/address_dictionary.h b/src/address_dictionary.h index cc5e8748..1a80ed6c 100644 --- a/src/address_dictionary.h +++ b/src/address_dictionary.h @@ -63,6 +63,7 @@ bool search_address_dictionaries_with_phrases(char *str, char *lang, phrase_arra phrase_array *search_address_dictionaries_tokens(char *str, token_array *tokens, char *lang); bool search_address_dictionaries_tokens_with_phrases(char *str, token_array *tokens, char *lang, phrase_array **phrases); +phrase_t search_address_dictionaries_substring(char *str, size_t len, char *lang); phrase_t search_address_dictionaries_prefix(char *str, size_t len, char *lang); phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang); From bc9f11d6e37a4c604648e8c3b4ddf21721195fd6 Mon Sep 17 
00:00:00 2001 From: Al Date: Sat, 28 Oct 2017 02:45:48 -0400 Subject: [PATCH 21/89] [similarity] exposing unicode versions of Damerau-Levenshtein and Jaro-Winkler distances --- src/string_similarity.c | 79 ++++++++++++++++++++++++----------------- src/string_similarity.h | 6 ++++ 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index 899b9ec9..1043c708 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -1,8 +1,7 @@ #include "string_similarity.h" #include "string_utils.h" - -size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) { +ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) { size_t len1 = u1_array->n; size_t len2 = u2_array->n; @@ -12,6 +11,10 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array size_t num_bytes = (len1 + 1) * sizeof(size_t); size_t *column = malloc(num_bytes); + if (column == NULL) { + return -1.0; + } + for (size_t y = 1; y <= len1; y++) { column[y] = y; } @@ -47,26 +50,26 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array size_t dist = column[len1]; free(column); - return dist; + return (ssize_t)dist; } ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) { if (s1 == NULL || s2 == NULL) return -1; - uint32_array *u1 = unicode_codepoints(s1); - if (u1 == NULL) return -1.0; + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return -1.0; - uint32_array *u2 = unicode_codepoints(s2); + uint32_array *u2_array = unicode_codepoints(s2); - if (u2 == NULL) { - uint32_array_destroy(u1); + if (u2_array == NULL) { + uint32_array_destroy(u1_array); return -1.0; } - ssize_t lev = damerau_levenshtein_distance_unicode(u1, u2, replace_cost); + ssize_t lev = damerau_levenshtein_distance_unicode(u1_array, u2_array, 
replace_cost); - uint32_array_destroy(u1); - uint32_array_destroy(u2); + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); return lev; } @@ -151,27 +154,6 @@ double jaro_distance(const char *s1, const char *s2) { return -1.0; } - uint32_array *u1 = unicode_codepoints(s1); - if (u1 == NULL) return -1.0; - - uint32_array *u2 = unicode_codepoints(s2); - - if (u2 == NULL) { - uint32_array_destroy(u1); - return -1.0; - } - - double jaro = jaro_distance_unicode(u1, u2); - uint32_array_destroy(u1); - uint32_array_destroy(u2); - return jaro; -} - -double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) { - if (s1 == NULL || s2 == NULL) { - return -1.0; - } - uint32_array *u1_array = unicode_codepoints(s1); if (u1_array == NULL) return -1.0; @@ -182,6 +164,13 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do return -1.0; } + double jaro = jaro_distance_unicode(u1_array, u2_array); + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + return jaro; +} + +double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) { double jaro = jaro_distance_unicode(u1_array, u2_array); double j; @@ -205,12 +194,36 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale; } + return jaro_winkler > 1.0 ? 
1.0 : jaro_winkler; +} + +double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) { + if (s1 == NULL || s2 == NULL) { + return -1.0; + } + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return -1.0; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return -1.0; + } + + double jaro_winkler = jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, prefix_scale, bonus_threshold); + uint32_array_destroy(u1_array); uint32_array_destroy(u2_array); - return jaro_winkler > 1.0 ? 1.0 : jaro_winkler; + return jaro_winkler; } inline double jaro_winkler_distance(const char *s1, const char *s2) { return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD); } + +inline double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) { + return jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD); +} diff --git a/src/string_similarity.h b/src/string_similarity.h index 77f82e8d..2fa1005b 100644 --- a/src/string_similarity.h +++ b/src/string_similarity.h @@ -4,15 +4,21 @@ #include #include +#include "collections.h" + #define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1 #define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7 ssize_t damerau_levenshtein_distance(const char *s1, const char *s2); +ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost); ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost); double jaro_distance(const char *s1, const char *s2); +double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double 
bonus_threshold); +double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold); double jaro_winkler_distance(const char *s1, const char *s2); +double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); #endif From 3c6629ae3d24b6914cb3d065bbed8faaed9983a0 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 28 Oct 2017 17:22:14 -0400 Subject: [PATCH 22/89] [dictionaries] adding variants of & as synonyms in all languages --- resources/dictionaries/ca/ambiguous_expansions.txt | 1 + resources/dictionaries/ca/stopwords.txt | 2 +- resources/dictionaries/cs/ambiguous_expansions.txt | 1 + resources/dictionaries/cs/stopwords.txt | 2 +- resources/dictionaries/da/ambiguous_expansions.txt | 1 + resources/dictionaries/da/stopwords.txt | 1 + resources/dictionaries/de/ambiguous_expansions.txt | 1 + resources/dictionaries/en/ambiguous_expansions.txt | 1 + resources/dictionaries/en/stopwords.txt | 6 ++++-- resources/dictionaries/es/ambiguous_expansions.txt | 1 + resources/dictionaries/es/stopwords.txt | 2 +- resources/dictionaries/et/ambiguous_expansions.txt | 1 + resources/dictionaries/et/stopwords.txt | 1 + resources/dictionaries/eu/ambiguous_expansions.txt | 1 + resources/dictionaries/eu/stopwords.txt | 1 + resources/dictionaries/fi/ambiguous_expansions.txt | 1 + resources/dictionaries/fi/stopwords.txt | 1 + resources/dictionaries/fr/ambiguous_expansions.txt | 1 + resources/dictionaries/fr/stopwords.txt | 2 +- resources/dictionaries/gl/ambiguous_expansions.txt | 1 + resources/dictionaries/gl/stopwords.txt | 2 +- resources/dictionaries/hr/ambiguous_expansions.txt | 1 + resources/dictionaries/hr/stopwords.txt | 1 + resources/dictionaries/hu/ambiguous_expansions.txt | 1 + resources/dictionaries/hu/stopwords.txt | 2 +- resources/dictionaries/id/ambiguous_expansions.txt | 1 + resources/dictionaries/id/stopwords.txt | 2 +- resources/dictionaries/is/ambiguous_expansions.txt | 2 ++ 
resources/dictionaries/is/stopwords.txt | 1 + resources/dictionaries/it/ambiguous_expansions.txt | 1 + resources/dictionaries/it/stopwords.txt | 1 + resources/dictionaries/ka/ambiguous_expansions.txt | 1 + resources/dictionaries/ka/stopwords.txt | 2 +- resources/dictionaries/lt/ambiguous_expansions.txt | 1 + resources/dictionaries/lt/stopwords.txt | 1 + resources/dictionaries/lv/ambiguous_expansions.txt | 1 + resources/dictionaries/lv/stopwords.txt | 1 + resources/dictionaries/ms/ambiguous_expansions.txt | 1 + resources/dictionaries/ms/stopwords.txt | 1 + resources/dictionaries/mt/ambiguous_expansions.txt | 1 + resources/dictionaries/mt/stopwords.txt | 3 ++- resources/dictionaries/nb/ambiguous_expansions.txt | 1 + resources/dictionaries/nb/stopwords.txt | 2 +- resources/dictionaries/nl/ambiguous_expansions.txt | 1 + resources/dictionaries/nl/stopwords.txt | 2 +- resources/dictionaries/pl/ambiguous_expansions.txt | 1 + resources/dictionaries/pl/stopwords.txt | 2 +- resources/dictionaries/pt/ambiguous_expansions.txt | 1 + resources/dictionaries/pt/stopwords.txt | 2 +- resources/dictionaries/ro/ambiguous_expansions.txt | 1 + resources/dictionaries/ro/stopwords.txt | 1 + resources/dictionaries/ru/ambiguous_expansions.txt | 1 + resources/dictionaries/ru/stopwords.txt | 1 + resources/dictionaries/sk/ambiguous_expansions.txt | 1 + resources/dictionaries/sk/stopwords.txt | 2 +- resources/dictionaries/sl/ambiguous_expansions.txt | 1 + resources/dictionaries/sl/stopwords.txt | 1 + resources/dictionaries/sr/ambiguous_expansions.txt | 1 + resources/dictionaries/sr/stopwords.txt | 2 ++ resources/dictionaries/sv/ambiguous_expansions.txt | 1 + resources/dictionaries/sv/stopwords.txt | 1 + resources/dictionaries/tr/ambiguous_expansions.txt | 1 + resources/dictionaries/tr/stopwords.txt | 1 + resources/dictionaries/uk/ambiguous_expansions.txt | 1 + resources/dictionaries/uk/stopwords.txt | 2 ++ 65 files changed, 72 insertions(+), 16 deletions(-) create mode 100644 
resources/dictionaries/da/stopwords.txt create mode 100644 resources/dictionaries/et/stopwords.txt create mode 100644 resources/dictionaries/eu/stopwords.txt create mode 100644 resources/dictionaries/fi/stopwords.txt create mode 100644 resources/dictionaries/hr/stopwords.txt create mode 100644 resources/dictionaries/is/stopwords.txt create mode 100644 resources/dictionaries/lt/stopwords.txt create mode 100644 resources/dictionaries/lv/stopwords.txt create mode 100644 resources/dictionaries/ms/ambiguous_expansions.txt create mode 100644 resources/dictionaries/ms/stopwords.txt create mode 100644 resources/dictionaries/mt/ambiguous_expansions.txt create mode 100644 resources/dictionaries/ru/stopwords.txt create mode 100644 resources/dictionaries/sr/stopwords.txt create mode 100644 resources/dictionaries/tr/stopwords.txt create mode 100644 resources/dictionaries/uk/stopwords.txt diff --git a/resources/dictionaries/ca/ambiguous_expansions.txt b/resources/dictionaries/ca/ambiguous_expansions.txt index 60685bd0..669e46b8 100644 --- a/resources/dictionaries/ca/ambiguous_expansions.txt +++ b/resources/dictionaries/ca/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b d e diff --git a/resources/dictionaries/ca/stopwords.txt b/resources/dictionaries/ca/stopwords.txt index fbc06ff6..eeb8905a 100644 --- a/resources/dictionaries/ca/stopwords.txt +++ b/resources/dictionaries/ca/stopwords.txt @@ -13,7 +13,7 @@ el els es entre -i +i|& l' la les diff --git a/resources/dictionaries/cs/ambiguous_expansions.txt b/resources/dictionaries/cs/ambiguous_expansions.txt index 9eae731c..157fc657 100644 --- a/resources/dictionaries/cs/ambiguous_expansions.txt +++ b/resources/dictionaries/cs/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c j s diff --git a/resources/dictionaries/cs/stopwords.txt b/resources/dictionaries/cs/stopwords.txt index 2e65efe2..21d17436 100644 --- a/resources/dictionaries/cs/stopwords.txt +++ b/resources/dictionaries/cs/stopwords.txt @@ -1 +1 @@ -a \ No newline at end of file 
+a|& \ No newline at end of file diff --git a/resources/dictionaries/da/ambiguous_expansions.txt b/resources/dictionaries/da/ambiguous_expansions.txt index d0990461..cb32539b 100644 --- a/resources/dictionaries/da/ambiguous_expansions.txt +++ b/resources/dictionaries/da/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c n o diff --git a/resources/dictionaries/da/stopwords.txt b/resources/dictionaries/da/stopwords.txt new file mode 100644 index 00000000..fbda6bfa --- /dev/null +++ b/resources/dictionaries/da/stopwords.txt @@ -0,0 +1 @@ +og|& \ No newline at end of file diff --git a/resources/dictionaries/de/ambiguous_expansions.txt b/resources/dictionaries/de/ambiguous_expansions.txt index eaf4cbbf..63700fcc 100644 --- a/resources/dictionaries/de/ambiguous_expansions.txt +++ b/resources/dictionaries/de/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a b ch diff --git a/resources/dictionaries/en/ambiguous_expansions.txt b/resources/dictionaries/en/ambiguous_expansions.txt index dad6c2b7..a4de4500 100644 --- a/resources/dictionaries/en/ambiguous_expansions.txt +++ b/resources/dictionaries/en/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& aat act ab diff --git a/resources/dictionaries/en/stopwords.txt b/resources/dictionaries/en/stopwords.txt index 812b21f1..b033a3d4 100644 --- a/resources/dictionaries/en/stopwords.txt +++ b/resources/dictionaries/en/stopwords.txt @@ -1,10 +1,12 @@ -and +and|& all at between|betw|btwn|btw|btween|b / t by +in of +on the to via -opposite \ No newline at end of file +opposite|opp \ No newline at end of file diff --git a/resources/dictionaries/es/ambiguous_expansions.txt b/resources/dictionaries/es/ambiguous_expansions.txt index 8b443427..0ca210eb 100644 --- a/resources/dictionaries/es/ambiguous_expansions.txt +++ b/resources/dictionaries/es/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c cr d diff --git a/resources/dictionaries/es/stopwords.txt b/resources/dictionaries/es/stopwords.txt index 206d8773..4c309c30 100644 --- 
a/resources/dictionaries/es/stopwords.txt +++ b/resources/dictionaries/es/stopwords.txt @@ -26,4 +26,4 @@ por sin un una -y \ No newline at end of file +y|& \ No newline at end of file diff --git a/resources/dictionaries/et/ambiguous_expansions.txt b/resources/dictionaries/et/ambiguous_expansions.txt index d97bc2d4..538bfca6 100644 --- a/resources/dictionaries/et/ambiguous_expansions.txt +++ b/resources/dictionaries/et/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& k l p diff --git a/resources/dictionaries/et/stopwords.txt b/resources/dictionaries/et/stopwords.txt new file mode 100644 index 00000000..fa41c60f --- /dev/null +++ b/resources/dictionaries/et/stopwords.txt @@ -0,0 +1 @@ +ja|& \ No newline at end of file diff --git a/resources/dictionaries/eu/ambiguous_expansions.txt b/resources/dictionaries/eu/ambiguous_expansions.txt index 23fa7d31..553961dd 100644 --- a/resources/dictionaries/eu/ambiguous_expansions.txt +++ b/resources/dictionaries/eu/ambiguous_expansions.txt @@ -1 +1,2 @@ +& k \ No newline at end of file diff --git a/resources/dictionaries/eu/stopwords.txt b/resources/dictionaries/eu/stopwords.txt new file mode 100644 index 00000000..6aafbc3a --- /dev/null +++ b/resources/dictionaries/eu/stopwords.txt @@ -0,0 +1 @@ +eta|& \ No newline at end of file diff --git a/resources/dictionaries/fi/ambiguous_expansions.txt b/resources/dictionaries/fi/ambiguous_expansions.txt index 48370ef9..6dc2a2f1 100644 --- a/resources/dictionaries/fi/ambiguous_expansions.txt +++ b/resources/dictionaries/fi/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& k p r diff --git a/resources/dictionaries/fi/stopwords.txt b/resources/dictionaries/fi/stopwords.txt new file mode 100644 index 00000000..fa41c60f --- /dev/null +++ b/resources/dictionaries/fi/stopwords.txt @@ -0,0 +1 @@ +ja|& \ No newline at end of file diff --git a/resources/dictionaries/fr/ambiguous_expansions.txt b/resources/dictionaries/fr/ambiguous_expansions.txt index ca1ae415..e4ebf822 100644 --- 
a/resources/dictionaries/fr/ambiguous_expansions.txt +++ b/resources/dictionaries/fr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a ab bc diff --git a/resources/dictionaries/fr/stopwords.txt b/resources/dictionaries/fr/stopwords.txt index 7d323c41..5d19da08 100644 --- a/resources/dictionaries/fr/stopwords.txt +++ b/resources/dictionaries/fr/stopwords.txt @@ -15,7 +15,7 @@ du en en face de entre -et +et|& l' la le diff --git a/resources/dictionaries/gl/ambiguous_expansions.txt b/resources/dictionaries/gl/ambiguous_expansions.txt index 4634c792..127bded4 100644 --- a/resources/dictionaries/gl/ambiguous_expansions.txt +++ b/resources/dictionaries/gl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& e n o diff --git a/resources/dictionaries/gl/stopwords.txt b/resources/dictionaries/gl/stopwords.txt index 65cd1a9d..dfaea074 100644 --- a/resources/dictionaries/gl/stopwords.txt +++ b/resources/dictionaries/gl/stopwords.txt @@ -15,7 +15,7 @@ deles delas detras do -é +e|& en encima enfronte diff --git a/resources/dictionaries/hr/ambiguous_expansions.txt b/resources/dictionaries/hr/ambiguous_expansions.txt index 9e1e58ce..dcfd416e 100644 --- a/resources/dictionaries/hr/ambiguous_expansions.txt +++ b/resources/dictionaries/hr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c i j diff --git a/resources/dictionaries/hr/stopwords.txt b/resources/dictionaries/hr/stopwords.txt new file mode 100644 index 00000000..34661a71 --- /dev/null +++ b/resources/dictionaries/hr/stopwords.txt @@ -0,0 +1 @@ +i|& \ No newline at end of file diff --git a/resources/dictionaries/hu/ambiguous_expansions.txt b/resources/dictionaries/hu/ambiguous_expansions.txt index 367eb051..88190f16 100644 --- a/resources/dictionaries/hu/ambiguous_expansions.txt +++ b/resources/dictionaries/hu/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& d e k diff --git a/resources/dictionaries/hu/stopwords.txt b/resources/dictionaries/hu/stopwords.txt index b426abeb..6545dae0 100644 --- a/resources/dictionaries/hu/stopwords.txt +++ 
b/resources/dictionaries/hu/stopwords.txt @@ -1,4 +1,4 @@ a az egy -és|es \ No newline at end of file +és|es|& \ No newline at end of file diff --git a/resources/dictionaries/id/ambiguous_expansions.txt b/resources/dictionaries/id/ambiguous_expansions.txt index 02700b38..fba1c684 100644 --- a/resources/dictionaries/id/ambiguous_expansions.txt +++ b/resources/dictionaries/id/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& bg bu di diff --git a/resources/dictionaries/id/stopwords.txt b/resources/dictionaries/id/stopwords.txt index efa1719b..ee774698 100644 --- a/resources/dictionaries/id/stopwords.txt +++ b/resources/dictionaries/id/stopwords.txt @@ -1,5 +1,5 @@ berlawanan|lawanan|lwnn -dan|dn|n +dan|dn|n|en|& dari|dr dekat|dkt di diff --git a/resources/dictionaries/is/ambiguous_expansions.txt b/resources/dictionaries/is/ambiguous_expansions.txt index e2e18f37..65a14de0 100644 --- a/resources/dictionaries/is/ambiguous_expansions.txt +++ b/resources/dictionaries/is/ambiguous_expansions.txt @@ -1,4 +1,6 @@ +& a n +og s v \ No newline at end of file diff --git a/resources/dictionaries/is/stopwords.txt b/resources/dictionaries/is/stopwords.txt new file mode 100644 index 00000000..fbda6bfa --- /dev/null +++ b/resources/dictionaries/is/stopwords.txt @@ -0,0 +1 @@ +og|& \ No newline at end of file diff --git a/resources/dictionaries/it/ambiguous_expansions.txt b/resources/dictionaries/it/ambiguous_expansions.txt index 1bcea6fb..a2723dc9 100644 --- a/resources/dictionaries/it/ambiguous_expansions.txt +++ b/resources/dictionaries/it/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c e l diff --git a/resources/dictionaries/it/stopwords.txt b/resources/dictionaries/it/stopwords.txt index 756c3ef7..1df9448f 100644 --- a/resources/dictionaries/it/stopwords.txt +++ b/resources/dictionaries/it/stopwords.txt @@ -24,6 +24,7 @@ dell' dentro|d.tro|dtro di d' +e|& fuori gli i diff --git a/resources/dictionaries/ka/ambiguous_expansions.txt b/resources/dictionaries/ka/ambiguous_expansions.txt 
index 926a453d..ea2bb480 100644 --- a/resources/dictionaries/ka/ambiguous_expansions.txt +++ b/resources/dictionaries/ka/ambiguous_expansions.txt @@ -1 +1,2 @@ +& ქ \ No newline at end of file diff --git a/resources/dictionaries/ka/stopwords.txt b/resources/dictionaries/ka/stopwords.txt index 01648f46..217f7039 100644 --- a/resources/dictionaries/ka/stopwords.txt +++ b/resources/dictionaries/ka/stopwords.txt @@ -1 +1 @@ -და \ No newline at end of file +და|& \ No newline at end of file diff --git a/resources/dictionaries/lt/ambiguous_expansions.txt b/resources/dictionaries/lt/ambiguous_expansions.txt index bd110234..04b336c5 100644 --- a/resources/dictionaries/lt/ambiguous_expansions.txt +++ b/resources/dictionaries/lt/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a g k diff --git a/resources/dictionaries/lt/stopwords.txt b/resources/dictionaries/lt/stopwords.txt new file mode 100644 index 00000000..06eeff5e --- /dev/null +++ b/resources/dictionaries/lt/stopwords.txt @@ -0,0 +1 @@ +ir|& \ No newline at end of file diff --git a/resources/dictionaries/lv/ambiguous_expansions.txt b/resources/dictionaries/lv/ambiguous_expansions.txt index 9cae39b7..3dd31f68 100644 --- a/resources/dictionaries/lv/ambiguous_expansions.txt +++ b/resources/dictionaries/lv/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& a d g diff --git a/resources/dictionaries/lv/stopwords.txt b/resources/dictionaries/lv/stopwords.txt new file mode 100644 index 00000000..c4409fcb --- /dev/null +++ b/resources/dictionaries/lv/stopwords.txt @@ -0,0 +1 @@ +un|& \ No newline at end of file diff --git a/resources/dictionaries/ms/ambiguous_expansions.txt b/resources/dictionaries/ms/ambiguous_expansions.txt new file mode 100644 index 00000000..00b15c0a --- /dev/null +++ b/resources/dictionaries/ms/ambiguous_expansions.txt @@ -0,0 +1 @@ +& \ No newline at end of file diff --git a/resources/dictionaries/ms/stopwords.txt b/resources/dictionaries/ms/stopwords.txt new file mode 100644 index 00000000..ff6fa2f8 --- /dev/null 
+++ b/resources/dictionaries/ms/stopwords.txt @@ -0,0 +1 @@ +dan|& \ No newline at end of file diff --git a/resources/dictionaries/mt/ambiguous_expansions.txt b/resources/dictionaries/mt/ambiguous_expansions.txt new file mode 100644 index 00000000..00b15c0a --- /dev/null +++ b/resources/dictionaries/mt/ambiguous_expansions.txt @@ -0,0 +1 @@ +& \ No newline at end of file diff --git a/resources/dictionaries/mt/stopwords.txt b/resources/dictionaries/mt/stopwords.txt index bc46bf88..059d1b8d 100644 --- a/resources/dictionaries/mt/stopwords.txt +++ b/resources/dictionaries/mt/stopwords.txt @@ -1,4 +1,5 @@ il is ta -tar \ No newline at end of file +tar +u|& \ No newline at end of file diff --git a/resources/dictionaries/nb/ambiguous_expansions.txt b/resources/dictionaries/nb/ambiguous_expansions.txt index f959e6b2..58ef95fb 100644 --- a/resources/dictionaries/nb/ambiguous_expansions.txt +++ b/resources/dictionaries/nb/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& g h k diff --git a/resources/dictionaries/nb/stopwords.txt b/resources/dictionaries/nb/stopwords.txt index 519fb87f..c19c7b3b 100644 --- a/resources/dictionaries/nb/stopwords.txt +++ b/resources/dictionaries/nb/stopwords.txt @@ -23,7 +23,7 @@ naer nærmest naermest nest -og +og|& overfor over på diff --git a/resources/dictionaries/nl/ambiguous_expansions.txt b/resources/dictionaries/nl/ambiguous_expansions.txt index 27dd82a3..a1123e76 100644 --- a/resources/dictionaries/nl/ambiguous_expansions.txt +++ b/resources/dictionaries/nl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b h k diff --git a/resources/dictionaries/nl/stopwords.txt b/resources/dictionaries/nl/stopwords.txt index 3f5ff96b..b4c04a01 100644 --- a/resources/dictionaries/nl/stopwords.txt +++ b/resources/dictionaries/nl/stopwords.txt @@ -9,7 +9,7 @@ der die dit een -en +en|& hem het hoe diff --git a/resources/dictionaries/pl/ambiguous_expansions.txt b/resources/dictionaries/pl/ambiguous_expansions.txt index 72a9695d..8e4c26a3 100644 --- 
a/resources/dictionaries/pl/ambiguous_expansions.txt +++ b/resources/dictionaries/pl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& d g k diff --git a/resources/dictionaries/pl/stopwords.txt b/resources/dictionaries/pl/stopwords.txt index 7d40e787..bd5a4769 100644 --- a/resources/dictionaries/pl/stopwords.txt +++ b/resources/dictionaries/pl/stopwords.txt @@ -1,3 +1,3 @@ -i +i|& na w \ No newline at end of file diff --git a/resources/dictionaries/pt/ambiguous_expansions.txt b/resources/dictionaries/pt/ambiguous_expansions.txt index bdf461fa..c6bbdd08 100644 --- a/resources/dictionaries/pt/ambiguous_expansions.txt +++ b/resources/dictionaries/pt/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b d e diff --git a/resources/dictionaries/pt/stopwords.txt b/resources/dictionaries/pt/stopwords.txt index 7e0d5fe8..0bbaec62 100644 --- a/resources/dictionaries/pt/stopwords.txt +++ b/resources/dictionaries/pt/stopwords.txt @@ -14,7 +14,7 @@ debaixo defronte do dos -e +e|& em em frente de|em ft de entre diff --git a/resources/dictionaries/ro/ambiguous_expansions.txt b/resources/dictionaries/ro/ambiguous_expansions.txt index b9c9a54f..b663b501 100644 --- a/resources/dictionaries/ro/ambiguous_expansions.txt +++ b/resources/dictionaries/ro/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& e n s diff --git a/resources/dictionaries/ro/stopwords.txt b/resources/dictionaries/ro/stopwords.txt index 3e6e19ed..80195e69 100644 --- a/resources/dictionaries/ro/stopwords.txt +++ b/resources/dictionaries/ro/stopwords.txt @@ -1 +1,2 @@ +și|si|& cel \ No newline at end of file diff --git a/resources/dictionaries/ru/ambiguous_expansions.txt b/resources/dictionaries/ru/ambiguous_expansions.txt index 213e43c3..e38e90ed 100644 --- a/resources/dictionaries/ru/ambiguous_expansions.txt +++ b/resources/dictionaries/ru/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& д d г diff --git a/resources/dictionaries/ru/stopwords.txt b/resources/dictionaries/ru/stopwords.txt new file mode 100644 index 00000000..ae72c191 --- 
/dev/null +++ b/resources/dictionaries/ru/stopwords.txt @@ -0,0 +1 @@ +и|& \ No newline at end of file diff --git a/resources/dictionaries/sk/ambiguous_expansions.txt b/resources/dictionaries/sk/ambiguous_expansions.txt index 9eae731c..157fc657 100644 --- a/resources/dictionaries/sk/ambiguous_expansions.txt +++ b/resources/dictionaries/sk/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c j s diff --git a/resources/dictionaries/sk/stopwords.txt b/resources/dictionaries/sk/stopwords.txt index 54e62505..aff4505e 100644 --- a/resources/dictionaries/sk/stopwords.txt +++ b/resources/dictionaries/sk/stopwords.txt @@ -1,4 +1,4 @@ -a +a|& bližko|blizko cez do diff --git a/resources/dictionaries/sl/ambiguous_expansions.txt b/resources/dictionaries/sl/ambiguous_expansions.txt index 9eae731c..157fc657 100644 --- a/resources/dictionaries/sl/ambiguous_expansions.txt +++ b/resources/dictionaries/sl/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& c j s diff --git a/resources/dictionaries/sl/stopwords.txt b/resources/dictionaries/sl/stopwords.txt index 8c6ca434..70fb5771 100644 --- a/resources/dictionaries/sl/stopwords.txt +++ b/resources/dictionaries/sl/stopwords.txt @@ -1,3 +1,4 @@ +in|& na ob pot diff --git a/resources/dictionaries/sr/ambiguous_expansions.txt b/resources/dictionaries/sr/ambiguous_expansions.txt index aee698c9..1fcdb71b 100644 --- a/resources/dictionaries/sr/ambiguous_expansions.txt +++ b/resources/dictionaries/sr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& и i ј diff --git a/resources/dictionaries/sr/stopwords.txt b/resources/dictionaries/sr/stopwords.txt new file mode 100644 index 00000000..b25c373b --- /dev/null +++ b/resources/dictionaries/sr/stopwords.txt @@ -0,0 +1,2 @@ +и|& +i|& \ No newline at end of file diff --git a/resources/dictionaries/sv/ambiguous_expansions.txt b/resources/dictionaries/sv/ambiguous_expansions.txt index fb91fa69..7224b6d3 100644 --- a/resources/dictionaries/sv/ambiguous_expansions.txt +++ b/resources/dictionaries/sv/ambiguous_expansions.txt 
@@ -1,3 +1,4 @@ +& g l k diff --git a/resources/dictionaries/sv/stopwords.txt b/resources/dictionaries/sv/stopwords.txt index 70ae014f..a1658555 100644 --- a/resources/dictionaries/sv/stopwords.txt +++ b/resources/dictionaries/sv/stopwords.txt @@ -16,6 +16,7 @@ intill mellan motliggande närmast|naermast +och|& över|oever på|paa på andra sidan|paa andra sidan diff --git a/resources/dictionaries/tr/ambiguous_expansions.txt b/resources/dictionaries/tr/ambiguous_expansions.txt index 601b3955..fd7bdf78 100644 --- a/resources/dictionaries/tr/ambiguous_expansions.txt +++ b/resources/dictionaries/tr/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& b d g diff --git a/resources/dictionaries/tr/stopwords.txt b/resources/dictionaries/tr/stopwords.txt new file mode 100644 index 00000000..f1f6e736 --- /dev/null +++ b/resources/dictionaries/tr/stopwords.txt @@ -0,0 +1 @@ +ve|& \ No newline at end of file diff --git a/resources/dictionaries/uk/ambiguous_expansions.txt b/resources/dictionaries/uk/ambiguous_expansions.txt index 994b54a6..4ef2c05e 100644 --- a/resources/dictionaries/uk/ambiguous_expansions.txt +++ b/resources/dictionaries/uk/ambiguous_expansions.txt @@ -1,3 +1,4 @@ +& д d ш diff --git a/resources/dictionaries/uk/stopwords.txt b/resources/dictionaries/uk/stopwords.txt new file mode 100644 index 00000000..0401c8a8 --- /dev/null +++ b/resources/dictionaries/uk/stopwords.txt @@ -0,0 +1,2 @@ +і|& +i|& \ No newline at end of file From 665b7804227a411f09680dc17d86078000d478ba Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 11 Nov 2017 02:45:41 -0500 Subject: [PATCH 23/89] [utils] adding unicode_equals function in string_utils for testing equality of unicode char arrays --- src/string_utils.c | 13 +++++++++++++ src/string_utils.h | 1 + 2 files changed, 14 insertions(+) diff --git a/src/string_utils.c b/src/string_utils.c index 6b1b14ab..79083e09 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -390,6 +390,19 @@ uint32_array *unicode_codepoints(const char *str) { return a; 
} +bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 != len2) return false; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + for (size_t i = 0; i < len1; i++) { + if (u1[i] != u2[i]) return false; + } + return true; +} + int utf8_compare_len(const char *str1, const char *str2, size_t len) { if (len == 0) return 0; diff --git a/src/string_utils.h b/src/string_utils.h index 29683f91..eb27651f 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -88,6 +88,7 @@ bool utf8_equal_ignore_separators(const char *str1, const char *str2); ssize_t utf8_len(const char *str, size_t len); uint32_array *unicode_codepoints(const char *str); +bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array); bool utf8_is_hyphen(int32_t ch); bool utf8_is_letter(int cat); From 751873e56bd9ccf9e4e91b0307d31acc7776d4f8 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 11 Nov 2017 03:07:39 -0500 Subject: [PATCH 24/89] [similarity] a *NEW* sequence alignment algorithm which builds on Smith-Waterman-Gotoh with affine gap penalties. Like Smith-Waterman, it performs a local alignment, and like the cost-only version of Gotoh's improvement, it needs O(mn) time and O(m) space (where m is the length of the longer string). 
However, this version of the algorithm stores and returns a breakdown of the number and specific types of edits it makes (matches, mismatches, gap opens, gap extensions, and transpositions) rather than rolling them up into a single cost, and without needing to return/compute the full alignment as in Needleman-Wunsch or Hirschberg's variant --- src/string_similarity.c | 285 ++++++++++++++++++++++++++++++++++++++++ src/string_similarity.h | 21 ++- 2 files changed, 304 insertions(+), 2 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index 1043c708..bfdb1f11 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -1,6 +1,291 @@ #include "string_similarity.h" #include "string_utils.h" +#include + +static affine_gap_edits_t NULL_AFFINE_GAP_EDITS = { + .num_matches = 0, + .num_mismatches = 0, + .num_transpositions = 0, + .num_gap_opens = 0, + .num_gap_extensions = 0 +}; + +typedef enum { + AFFINE_CHAR_MATCH, + AFFINE_CHAR_MISMATCH, + AFFINE_TRANSPOSITION, + AFFINE_GAP_OPEN, + AFFINE_GAP_EXTEND +} affine_gap_op; + +static inline bool space_or_equivalent(int32_t c) { + int cat = utf8proc_category(c); + return utf8_is_whitespace(c) || utf8_is_hyphen(c) || utf8_is_punctuation(cat); +} + +affine_gap_edits_t affine_gap_distance_unicode_costs(uint32_array *u1_array, uint32_array *u2_array, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { + if (u1_array->n < u2_array->n) { + uint32_array *tmp_array = u1_array; + u1_array = u2_array; + u2_array = tmp_array; + } + + size_t m = u1_array->n; + size_t n = u2_array->n; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + + affine_gap_edits_t edits = NULL_AFFINE_GAP_EDITS; + + if (unicode_equals(u1_array, u2_array)) { + edits.num_matches = n; + return edits; + } + + size_t num_bytes = (m + 1) * sizeof(size_t); + + size_t *C = malloc(num_bytes); + if (C == NULL) { + return NULL_AFFINE_GAP_EDITS; + } + + size_t *D = 
malloc(num_bytes); + if (D == NULL) { + free(C); + return NULL_AFFINE_GAP_EDITS; + } + + affine_gap_edits_t *E = malloc((m + 1) * sizeof(affine_gap_edits_t)); + if (E == NULL) { + free(C); + free(D); + return NULL_AFFINE_GAP_EDITS; + } + + affine_gap_edits_t *ED = malloc((m + 1) * sizeof(affine_gap_edits_t)); + if (ED == NULL) { + free(C); + free(D); + free(E); + return NULL_AFFINE_GAP_EDITS; + } + + size_t e = 0, c = 0, s = 0; + + C[0] = 0; + E[0] = NULL_AFFINE_GAP_EDITS; + size_t t = start_gap_cost; + + affine_gap_edits_t base_edits = NULL_AFFINE_GAP_EDITS; + base_edits.num_gap_opens++; + + for (size_t j = 1; j < m + 1; j++) { + t += extend_gap_cost; + C[j] = t; + D[j] = t + start_gap_cost; + base_edits.num_gap_extensions++; + E[j] = base_edits; + ED[j] = base_edits; + } + + t = start_gap_cost; + base_edits = NULL_AFFINE_GAP_EDITS; + base_edits.num_gap_opens++; + + affine_gap_edits_t current_edits = NULL_AFFINE_GAP_EDITS; + affine_gap_edits_t prev_char_edits = NULL_AFFINE_GAP_EDITS; + affine_gap_edits_t prev_row_prev_char_edits = NULL_AFFINE_GAP_EDITS; + + bool in_gap = false; + + for (size_t i = 1; i < n + 1; i++) { + // s = CC[0] + s = C[0]; + uint32_t c2 = u2[i - 1]; + // CC[0] = c = t = t + h + t += extend_gap_cost; + c = t; + C[0] = c; + + prev_row_prev_char_edits = E[0]; + base_edits.num_gap_extensions++; + prev_char_edits = base_edits; + E[0] = prev_char_edits; + + // e = t + g + e = t + start_gap_cost; + + affine_gap_op op = AFFINE_GAP_OPEN; + + ssize_t match_at = -1; + + size_t min_at = 0; + size_t min_cost = SIZE_MAX; + + for (size_t j = 1; j < m + 1; j++) { + // insertion + // e = min(e, c + g) + h + size_t min = e; + uint32_t c1 = u1[j - 1]; + + affine_gap_op insert_op = AFFINE_GAP_OPEN; + + if ((c + start_gap_cost) < min) { + min = c + start_gap_cost; + insert_op = AFFINE_GAP_OPEN; + } else { + insert_op = AFFINE_GAP_EXTEND; + } + + e = min + extend_gap_cost; + + // deletion + // DD[j] = min(DD[j], CC[j] + g) + h + + affine_gap_op delete_op = 
AFFINE_GAP_OPEN; + + min = D[j]; + affine_gap_edits_t delete_edits = ED[j]; + affine_gap_edits_t delete_edits_stored = delete_edits; + delete_op = AFFINE_GAP_OPEN; + if (C[j] + start_gap_cost < min) { + min = C[j] + start_gap_cost; + + delete_edits = delete_edits_stored = E[j]; + delete_edits_stored.num_gap_opens++; + } + + D[j] = min + extend_gap_cost; + delete_edits_stored.num_gap_extensions++; + ED[j] = delete_edits_stored; + + // Cost + // c = min(DD[j], e, s + w(a, b)) + + affine_gap_op current_op = delete_op; + + + min = D[j]; + + // Delete transition + current_edits = delete_edits; + + if (e < min) { + min = e; + // Insert transition + current_op = insert_op; + current_edits = prev_char_edits; + } + + bool both_separators = space_or_equivalent((int32_t)c1) && space_or_equivalent((int32_t)c2); + + bool is_transpose = false; + size_t w = c1 != c2 && !both_separators ? mismatch_cost : match_cost; + + if (c1 != c2 && j < m && utf8_is_letter(c2) && utf8_is_letter(c1) && c2 == u1[j] && i < n && c1 == u2[i]) { + w = transpose_cost; + is_transpose = true; + } + + if (s + w < min) { + min = s + w; + + // Match/mismatch/transpose transition + current_edits = prev_row_prev_char_edits; + + if ((c1 == c2 || both_separators) && !is_transpose) { + current_op = AFFINE_CHAR_MATCH; + } else if (!is_transpose) { + current_op = AFFINE_CHAR_MISMATCH; + } else if (is_transpose) { + current_op = AFFINE_TRANSPOSITION; + } + } + + if (current_op == AFFINE_CHAR_MATCH) { + current_edits.num_matches++; + } else if (current_op == AFFINE_CHAR_MISMATCH) { + current_edits.num_mismatches++; + } else if (current_op == AFFINE_GAP_EXTEND) { + current_edits.num_gap_extensions++; + } else if (current_op == AFFINE_GAP_OPEN) { + current_edits.num_gap_opens++; + current_edits.num_gap_extensions++; + } else if (current_op == AFFINE_TRANSPOSITION) { + current_edits.num_transpositions++; + } + + if (min < min_cost) { + op = current_op; + min_cost = min; + min_at = j; + } + + c = min; + s = C[j]; + 
C[j] = c; + + prev_char_edits = current_edits; + prev_row_prev_char_edits = E[j]; + E[j] = prev_char_edits; + + // In the case of a transposition, duplicate costs for next character and advance by 2 + if (current_op == AFFINE_TRANSPOSITION) { + E[j + 1] = E[j]; + C[j + 1] = C[j]; + j++; + } + } + + if (op == AFFINE_TRANSPOSITION) { + i++; + } + + } + + affine_gap_edits_t ret = E[m]; + free(C); + free(D); + free(E); + free(ED); + + return ret; + +} + +affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) { + return affine_gap_distance_unicode_costs(u1_array, u2_array, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); +} + +affine_gap_edits_t affine_gap_distance_costs(char *s1, char *s2, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { + if (s1 == NULL || s2 == NULL) return NULL_AFFINE_GAP_EDITS; + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return NULL_AFFINE_GAP_EDITS; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return NULL_AFFINE_GAP_EDITS; + } + + affine_gap_edits_t affine_gap = affine_gap_distance_unicode_costs(u1_array, u2_array, start_gap_cost, extend_gap_cost, match_cost, mismatch_cost, transpose_cost); + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + + return affine_gap; +} + + +affine_gap_edits_t affine_gap_distance(char *s1, char *s2) { + return affine_gap_distance_costs(s1, s2, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); +} + + ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) { size_t len1 = u1_array->n; size_t len2 = u2_array->n; diff 
--git a/src/string_similarity.h b/src/string_similarity.h index 2fa1005b..eb66dd97 100644 --- a/src/string_similarity.h +++ b/src/string_similarity.h @@ -6,13 +6,30 @@ #include "collections.h" -#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1 -#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7 +#define DEFAULT_AFFINE_GAP_OPEN_COST 3 +#define DEFAULT_AFFINE_GAP_EXTEND_COST 2 +#define DEFAULT_AFFINE_GAP_MATCH_COST 0 +#define DEFAULT_AFFINE_GAP_MISMATCH_COST 6 +#define DEFAULT_AFFINE_GAP_TRANSPOSE_COST 4 + +typedef struct affine_gap_edits { + size_t num_matches; + size_t num_mismatches; + size_t num_transpositions; + size_t num_gap_opens; + size_t num_gap_extensions; +} affine_gap_edits_t; + +affine_gap_edits_t affine_gap_distance(char *s1, char *s2); +affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); ssize_t damerau_levenshtein_distance(const char *s1, const char *s2); ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost); ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost); +#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1 +#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7 + double jaro_distance(const char *s1, const char *s2); double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold); From b34e5783661990db8ba16ecd1f1ba68cff36c995 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 11 Nov 2017 04:02:28 -0500 Subject: [PATCH 25/89] [similarity] using new sequence alignment breakdown by operation to tell if any two words are an abbreviation. The loose variant requires that the alignment covers all characters in the shortest string, which matches things like Services vs. 
Svc, whereas the strict variant requires that either the shorter string is a prefix of the longer one (Inc and Incorporated) or that the two strings share both a prefix and a suffix (Dept and Department). Both variants require that the strings share at least the first letter in common. --- src/string_similarity.c | 77 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 6 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index bfdb1f11..806d28ea 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -245,13 +245,13 @@ affine_gap_edits_t affine_gap_distance_unicode_costs(uint32_array *u1_array, uin } - affine_gap_edits_t ret = E[m]; + edits = E[m]; free(C); free(D); free(E); free(ED); - return ret; + return edits; } @@ -259,7 +259,7 @@ affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_ar return affine_gap_distance_unicode_costs(u1_array, u2_array, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); } -affine_gap_edits_t affine_gap_distance_costs(char *s1, char *s2, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { +affine_gap_edits_t affine_gap_distance_costs(const char *s1, const char *s2, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { if (s1 == NULL || s2 == NULL) return NULL_AFFINE_GAP_EDITS; uint32_array *u1_array = unicode_codepoints(s1); @@ -272,20 +272,85 @@ affine_gap_edits_t affine_gap_distance_costs(char *s1, char *s2, size_t start_ga return NULL_AFFINE_GAP_EDITS; } - affine_gap_edits_t affine_gap = affine_gap_distance_unicode_costs(u1_array, u2_array, start_gap_cost, extend_gap_cost, match_cost, mismatch_cost, transpose_cost); + affine_gap_edits_t edits = affine_gap_distance_unicode_costs(u1_array, u2_array, start_gap_cost, 
extend_gap_cost, match_cost, mismatch_cost, transpose_cost); uint32_array_destroy(u1_array); uint32_array_destroy(u2_array); - return affine_gap; + return edits; } -affine_gap_edits_t affine_gap_distance(char *s1, char *s2) { +affine_gap_edits_t affine_gap_distance(const char *s1, const char *s2) { return affine_gap_distance_costs(s1, s2, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); } +bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_array *u2_array, affine_gap_edits_t edits) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 == 0 || len2 == 0) return false; + + size_t min_len = len1 < len2 ? len1 : len2; + + return edits.num_matches == min_len && u1_array->a[0] == u2_array->a[0]; +} + +inline bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array) { + affine_gap_edits_t edits = affine_gap_distance_unicode(u1_array, u2_array); + + return possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits); +} + + +bool possible_abbreviation_unicode_strict(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 == 0 || len2 == 0) return false; + + size_t min_len = len1 < len2 ? 
len1 : len2; + + ssize_t prefix_len = unicode_common_prefix(u1_array, u2_array); + if (prefix_len == min_len) return true; + ssize_t suffix_len = unicode_common_suffix(u1_array, u2_array); + return suffix_len > 0 && prefix_len > 0 && possible_abbreviation_unicode(u1_array, u2_array); +} + +static bool possible_abbreviation_options(const char *s1, const char *s2, bool strict) { + if (s1 == NULL || s2 == NULL) return false; + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return false; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return false; + } + + bool abbrev = false; + if (!strict) { + abbrev = possible_abbreviation_unicode(u1_array, u2_array); + } else { + abbrev = possible_abbreviation_unicode_strict(u1_array, u2_array); + } + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + + return abbrev; +} + +inline bool possible_abbreviation(const char *s1, const char *s2) { + return possible_abbreviation_options(s1, s2, false); +} + +inline bool possible_abbreviation_strict(const char *s1, const char *s2) { + return possible_abbreviation_options(s1, s2, true); +} + + ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) { size_t len1 = u1_array->n; size_t len2 = u2_array->n; From fbf88aee8828f779a3d6805872609c3fae6721c7 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 12 Nov 2017 04:48:26 -0500 Subject: [PATCH 26/89] [similarity] adding possible abbreviation functions to header, making everything const char * --- src/string_similarity.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/string_similarity.h b/src/string_similarity.h index eb66dd97..51c25208 100644 --- a/src/string_similarity.h +++ b/src/string_similarity.h @@ -20,9 +20,15 @@ typedef struct affine_gap_edits { size_t num_gap_extensions; } affine_gap_edits_t; -affine_gap_edits_t affine_gap_distance(char *s1, 
char *s2); +affine_gap_edits_t affine_gap_distance(const char *s1, const char *s2); affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_array *u2_array); +bool possible_abbreviation(const char *s1, const char *s2); +bool possible_abbreviation_strict(const char *s1, const char *s2); +bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array); +bool possible_abbreviation_unicode_strict(uint32_array *u1_array, uint32_array *u2_array); +bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_array *u2_array, affine_gap_edits_t edits); + ssize_t damerau_levenshtein_distance(const char *s1, const char *s2); ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost); ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost); From e27f5f1d70cf852f7e01bab36f0a1d8b9a730792 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 29 Nov 2017 18:10:10 -0500 Subject: [PATCH 27/89] [api] adding LIBPOSTAL_EXPORT to some of the new public API functions in this branch --- src/libpostal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libpostal.h b/src/libpostal.h index 28ae900e..e88d5625 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -186,7 +186,7 @@ typedef struct libpostal_token { uint16_t type; } libpostal_token_t; -libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n); +LIBPOSTAL_EXPORT libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n); // Normalize string options #define LIBPOSTAL_NORMALIZE_STRING_LATIN_ASCII 1 << 0 @@ -220,7 +220,7 @@ libpostal_token_t *libpostal_tokenize(char *input, bool whitespace, size_t *n); #define LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS_NUMERIC (LIBPOSTAL_NORMALIZE_DEFAULT_TOKEN_OPTIONS | LIBPOSTAL_NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC) -char *libpostal_normalize_string(char *input, uint64_t options); 
+LIBPOSTAL_EXPORT char *libpostal_normalize_string(char *input, uint64_t options); typedef struct libpostal_normalized_token { From cfa5b1ce42ff908a26aecab35b7c22335857d3b6 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 4 Dec 2017 15:21:09 -0500 Subject: [PATCH 28/89] [similarity] adding a stopword-aware acronym alignment method for matching U.N. with United Nations, Museum of Modern Art with MoMA, as well as things like University of California - Los Angeles with UCLA. All of these should work across languages, including non-Latin character sets like Cyrillic (but not ideograms as the concept doesn't make as much sense there). Skipping tokens like "of" or "the" depends only on the stopwords dictionary being defined for a given language. --- src/acronyms.c | 140 +++++++++++++++++++++++++++++++++++++++ src/acronyms.h | 15 +++++ src/address_dictionary.c | 18 +++++ src/address_dictionary.h | 1 + src/string_utils.c | 8 ++- src/string_utils.h | 1 + 6 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 src/acronyms.c create mode 100644 src/acronyms.h diff --git a/src/acronyms.c b/src/acronyms.c new file mode 100644 index 00000000..ed91b5a6 --- /dev/null +++ b/src/acronyms.c @@ -0,0 +1,140 @@ +#include "acronyms.h" + +phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) { + if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) { + return NULL; + } + + size_t len1 = tokens1->n; + size_t len2 = tokens2->n; + if (len1 == 0 || len2 == 0 || len1 == len2) return NULL; + + if (len1 > len2) { + const char *tmp_s = s1; + s1 = s2; + s2 = tmp_s; + + token_array *tmp_t = tokens1; + tokens1 = tokens2; + tokens2 = tmp_t; + + size_t tmp_l = len1; + len1 = len2; + len2 = tmp_l; + } + + phrase_array *alignments = NULL; + + token_t *t1 = tokens1->a; + token_t *t2 = tokens2->a; + + uint32_array *stopwords_array = uint32_array_new_zeros(len2); + + uint32_t
*stopwords = stopwords_array->a; + + for (size_t l = 0; l < num_languages; l++) { + char *lang = languages[l]; + phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)s2, tokens2, lang); + + if (lang_phrases != NULL) { + size_t num_lang_phrases = lang_phrases->n; + for (size_t p = 0; p < num_lang_phrases; p++) { + phrase_t phrase = lang_phrases->a[p]; + + if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) { + for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) { + stopwords[stop_idx] = 1; + } + } + } + phrase_array_destroy(lang_phrases); + } + } + + ssize_t acronym_start = -1; + ssize_t acronym_token_pos = -1; + + uint8_t *ptr1 = (uint8_t *)s1; + uint8_t *ptr2 = (uint8_t *)s2; + + int32_t c1, c2; + ssize_t c1_len; + ssize_t c2_len; + + size_t t2_consumed = 0; + + for (size_t i = 0; i < len1; i++) { + token_t ti = t1[i]; + + c1_len = utf8proc_iterate(ptr1 + ti.offset, ti.len, &c1); + if (c1_len <= 0 || c1 == 0) { + break; + } + + // Make sure it's a non-ideographic word. 
Single letter abbreviations will be captured by other methods + if (!is_word_token(ti.type) || is_ideographic(ti.type) || ti.len == c1_len) { + acronym_token_pos = -1; + continue; + } + + size_t ti_pos = 0; + + for (size_t j = t2_consumed; j < len2; j++) { + token_t tj = t2[j]; + c2_len = utf8proc_iterate(ptr2 + tj.offset, tj.len, &c2); + if (c2_len <= 0) { + break; + } + + if (utf8proc_tolower(c1) == utf8proc_tolower(c2)) { + ti_pos += c1_len; + if (acronym_start < 0) { + acronym_start = j; + acronym_token_pos = 0; + } + acronym_token_pos++; + c1_len = utf8proc_iterate(ptr1 + ti.offset + ti_pos, ti.len, &c1); + } else if (stopwords[j] && acronym_token_pos > 0) { + continue; + } else if (is_punctuation(tj.type) && acronym_token_pos > 0) { + continue; + } else if (ti_pos < ti.len) { + acronym_token_pos = -1; + acronym_start = -1; + ti_pos = 0; + continue; + } + + if ((utf8_is_period(c1) || utf8_is_hyphen(c1)) && ti_pos < ti.len) { + ti_pos += c1_len; + if (ti_pos < ti.len) { + c1_len = utf8proc_iterate(ptr1 + ti.offset + ti_pos, ti.len, &c1); + if (c1_len <= 0 || c1 == 0) { + break; + } + } + } + + if (ti_pos == ti.len) { + phrase_t phrase = (phrase_t){acronym_start, j - acronym_start + 1, i}; + // got alignment + if (alignments == NULL) { + alignments = phrase_array_new(); + } + + phrase_array_push(alignments, phrase); + + ti_pos = 0; + acronym_token_pos = -1; + acronym_start = -1; + } + } + + } + + uint32_array_destroy(stopwords_array); + + return alignments; +} + + diff --git a/src/acronyms.h b/src/acronyms.h new file mode 100644 index 00000000..5c61002e --- /dev/null +++ b/src/acronyms.h @@ -0,0 +1,15 @@ +#ifndef ACRONYMS_H +#define ACRONYMS_H + +#include +#include + +#include "address_dictionary.h" +#include "collections.h" +#include "tokens.h" +#include "token_types.h" + +phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages); + + +#endif \ No newline at end of 
file diff --git a/src/address_dictionary.c b/src/address_dictionary.c index fd3fe471..957306b8 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -35,6 +35,24 @@ inline bool address_expansion_in_dictionary(address_expansion_t expansion, uint1 } +bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (address_expansion_in_dictionary(expansion, dictionary_id)) { + return true; + } + } + return false; +} + int32_t address_dictionary_next_canonical_index(void) { if (address_dict == NULL || address_dict->canonical == NULL) { diff --git a/src/address_dictionary.h b/src/address_dictionary.h index 1a80ed6c..0ee7934f 100644 --- a/src/address_dictionary.h +++ b/src/address_dictionary.h @@ -69,6 +69,7 @@ phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang); address_expansion_value_t *address_dictionary_get_expansions(uint32_t i); bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id); +bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id); char *address_dictionary_get_canonical(uint32_t index); int32_t address_dictionary_next_canonical_index(void); bool address_dictionary_add_canonical(char *canonical); diff --git a/src/string_utils.c b/src/string_utils.c index fcd35d74..567c2213 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -314,6 +314,12 @@ inline bool utf8_is_hyphen(int32_t ch) { return cat == UTF8PROC_CATEGORY_PD || ch == 0x2212; } +#define PERIOD_CODEPOINT 46 + +inline bool utf8_is_period(int32_t codepoint) { + return codepoint == PERIOD_CODEPOINT; +} 
+ inline bool utf8_is_punctuation(int cat) { return cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PE \ || cat == UTF8PROC_CATEGORY_PF || cat == UTF8PROC_CATEGORY_PI \ @@ -703,8 +709,6 @@ ssize_t string_next_codepoint(char *str, uint32_t codepoint) { return string_next_codepoint_len(str, codepoint, strlen(str)); } -#define PERIOD_CODEPOINT 46 - ssize_t string_next_period_len(char *str, size_t len) { return string_next_codepoint_len(str, PERIOD_CODEPOINT, len); } diff --git a/src/string_utils.h b/src/string_utils.h index eb27651f..86a018d8 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -91,6 +91,7 @@ uint32_array *unicode_codepoints(const char *str); bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array); bool utf8_is_hyphen(int32_t ch); +bool utf8_is_period(int32_t ch); bool utf8_is_letter(int cat); bool utf8_is_number(int cat); bool utf8_is_digit(int cat); From 55ba627c3cf6dc32a108b03d5b5f2c6880bfe8f8 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 8 Dec 2017 14:27:23 -0500 Subject: [PATCH 29/89] [similarity] needed to add utf8proc_category and invert the indices for counting transposes in affine gap --- src/string_similarity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index 806d28ea..953dbadd 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -184,7 +184,7 @@ affine_gap_edits_t affine_gap_distance_unicode_costs(uint32_array *u1_array, uin bool is_transpose = false; size_t w = c1 != c2 && !both_separators ? 
mismatch_cost : match_cost; - if (c1 != c2 && j < m && utf8_is_letter(c2) && utf8_is_letter(c1) && c2 == u1[j] && i < n && c1 == u2[i]) { + if (c1 != c2 && utf8_is_letter(utf8proc_category(c2)) && utf8_is_letter(utf8proc_category(c1)) && i < n && c2 == u1[i] && j < m && c1 == u2[j]) { w = transpose_cost; is_transpose = true; } From e4e84f0147a1ebd588eb29959af7773527857912 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 8 Dec 2017 14:28:30 -0500 Subject: [PATCH 30/89] [utils] adding unicode_common_prefix/unicode_common_suffix, string_hyphen_prefix_len and string_hyphen_suffix_len to string_utils --- src/string_utils.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++ src/string_utils.h | 5 ++++ 2 files changed, 68 insertions(+) diff --git a/src/string_utils.c b/src/string_utils.c index 567c2213..7045dd25 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -410,6 +410,47 @@ bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array) { return true; } +size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + + size_t min_len = len1 <= len2 ? len1 : len2; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + size_t common_prefix = 0; + + for (size_t i = 0; i < min_len; i++) { + if (u1[i] == u2[i]) { + common_prefix++; + } else { + break; + } + } + return common_prefix; +} + +size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + + size_t min_len = len1 <= len2 ? 
len1 : len2; + + uint32_t *u1 = u1_array->a; + uint32_t *u2 = u2_array->a; + size_t common_suffix = 0; + + for (size_t i = 0; i < min_len; i++) { + if (u1[len1 - i - 1] == u2[len2 - i - 1]) { + common_suffix++; + } else { + break; + } + } + return common_suffix; +} + + int utf8_compare_len(const char *str1, const char *str2, size_t len) { if (len == 0) return 0; @@ -749,6 +790,28 @@ size_t string_right_spaces_len(char *str, size_t len) { } +inline size_t string_hyphen_prefix_len(char *str, size_t len) { + // Strip beginning hyphens + int32_t unichr; + uint8_t *ptr = (uint8_t *)str; + ssize_t char_len = utf8proc_iterate(ptr, len, &unichr); + if (utf8_is_hyphen(unichr)) { + return (size_t)char_len; + } + return 0; +} + +inline size_t string_hyphen_suffix_len(char *str, size_t len) { + // Strip ending hyphens + int32_t unichr; + uint8_t *ptr = (uint8_t *)str; + ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr); + if (utf8_is_hyphen(unichr)) { + return (size_t)char_len; + } + return 0; +} + size_t string_left_spaces_len(char *str, size_t len) { size_t spaces = 0; diff --git a/src/string_utils.h b/src/string_utils.h index 86a018d8..a94f1d93 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -89,6 +89,8 @@ ssize_t utf8_len(const char *str, size_t len); uint32_array *unicode_codepoints(const char *str); bool unicode_equals(uint32_array *u1_array, uint32_array *u2_array); +size_t unicode_common_prefix(uint32_array *u1_array, uint32_array *u2_array); +size_t unicode_common_suffix(uint32_array *u1_array, uint32_array *u2_array); bool utf8_is_hyphen(int32_t ch); bool utf8_is_period(int32_t ch); @@ -119,6 +121,9 @@ bool string_contains_period(char *str); char *string_trim(char *str); +size_t string_hyphen_prefix_len(char *str, size_t len); +size_t string_hyphen_suffix_len(char *str, size_t len); + /* char_array is a dynamic character array defined in collections.h but has a few additional methods related to string manipulation. 
From 8968a6c9667722e3a3f2b49a5b9d7f69a8703e28 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 8 Dec 2017 16:26:00 -0500 Subject: [PATCH 31/89] [expand] moving expand to its own module so the internal methods can be exposed, calling from libpostal.c --- src/Makefile.am | 2 +- src/expand.c | 1086 ++++++++++++++++++++++++++++++++++++++++++++++ src/expand.h | 52 +++ src/libpostal.c | 1103 +---------------------------------------------- 4 files changed, 1143 insertions(+), 1100 deletions(-) create mode 100644 src/expand.c create mode 100644 src/expand.h diff --git a/src/Makefile.am b/src/Makefile.am index 6a13fce6..e76a3a1e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -12,7 +12,7 @@ DEFAULT_INCLUDES = -I.. -I/usr/local/include CFLAGS = lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = strndup.c libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c +libpostal_la_SOURCES = strndup.c libpostal.c expand.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined diff --git a/src/expand.c b/src/expand.c new file 
mode 100644 index 00000000..709c35ac --- /dev/null +++ b/src/expand.c @@ -0,0 +1,1086 @@ +#include + +#include "expand.h" + +#include "log/log.h" + +#include "address_dictionary.h" +#include "collections.h" +#include "constants.h" +#include "language_classifier.h" +#include "numex.h" +#include "normalize.h" +#include "scanner.h" +#include "string_utils.h" +#include "token_types.h" +#include "transliterate.h" + + +#define DEFAULT_KEY_LEN 32 + +#define EXCESSIVE_PERMUTATIONS 100 + +inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) { + uint64_t normalize_token_options = 0; + + normalize_token_options |= options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0; + normalize_token_options |= options.delete_acronym_periods ? NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS : 0; + normalize_token_options |= options.drop_english_possessives ? NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES : 0; + normalize_token_options |= options.delete_apostrophes ? NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE : 0; + + return normalize_token_options; +} + +inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) { + uint64_t normalize_string_options = 0; + normalize_string_options |= options.transliterate ? NORMALIZE_STRING_TRANSLITERATE : 0; + normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0; + normalize_string_options |= options.decompose ? NORMALIZE_STRING_DECOMPOSE : 0; + normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0; + normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0; + normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0; + normalize_string_options |= options.expand_numex ? 
NORMALIZE_STRING_REPLACE_NUMEX : 0; + + return normalize_string_options; +} + +void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { + + uint64_t normalize_token_options = get_normalize_token_options(options); + + if (token.type != WHITESPACE ) { + + bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len); + + if (!contains_hyphen || token.type == HYPHEN) { + log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type); + normalize_token(strings, str, token, normalize_token_options); + } else if (is_word_token(token.type)) { + + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + token.offset, token.len); + if (prefix_hyphen_len > 0) { + token.offset += prefix_hyphen_len; + } + + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + token.offset, token.len); + if (suffix_hyphen_len > 0) { + token.len -= suffix_hyphen_len; + } + + normalize_token(strings, str, token, normalize_token_options); + + if (options.replace_word_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS; + } + + if (options.delete_word_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; + } + + } else if (is_numeric_token(token.type)) { + normalize_token(strings, str, token, normalize_token_options); + + if (options.replace_word_hyphens || options.replace_numeric_hyphens) { + if (options.replace_word_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; + } + + if (options.replace_numeric_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; + } + + normalize_token(strings, str, token, normalize_token_options); + 
normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; + } + + if (options.delete_numeric_hyphens) { + normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; + } + } + + if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) { + normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + } + } else { + cstring_array_add_string(strings, " "); + } +} + +void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { + cstring_array_add_string(strings, str); + + if (options.roman_numerals) { + char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE); + if (numex_replaced != NULL) { + cstring_array_add_string(strings, numex_replaced); + free(numex_replaced); + } + + } + +} + + + +address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + if (value != NULL && value->components & options.address_components) { + return value->expansions; + } + + return NULL; +} + +inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { + if (expansion.canonical_index != NULL_CANONICAL_INDEX) { + char *canonical = address_dictionary_get_canonical(expansion.canonical_index); + uint64_t normalize_string_options = get_normalize_string_options(options); + char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); + canonical = 
canonical_normalized != NULL ? canonical_normalized : canonical; + + char_array_cat(key, canonical); + if (canonical_normalized != NULL) { + free(canonical_normalized); + } + } else { + char_array_cat_len(key, str + token.offset + phrase.start, phrase.len); + } +} + + +bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) { + cstring_array *strings = tree->strings; + + size_t skip_period = with_period ? 1 : 0; + + bool have_suffix = suffix.len > 0 && suffix.len < token.len; + bool have_prefix = prefix.len > 0 && prefix.len + with_period < token.len; + + if (!have_suffix && !have_prefix) { + return false; + } + + address_expansion_array *prefix_expansions = NULL; + address_expansion_array *suffix_expansions = NULL; + + address_expansion_t prefix_expansion; + address_expansion_t suffix_expansion; + + char *expansion; + + size_t num_strings = 0; + char *root_word = NULL; + size_t root_len; + token_t root_token; + cstring_array *root_strings = NULL; + int add_space = 0; + int spaces = 0; + + size_t prefix_start, prefix_end, root_end, suffix_start; + + if (have_prefix) { + prefix_expansions = valid_affix_expansions(prefix, options); + if (prefix_expansions == NULL) have_prefix = false; + } + + if (have_suffix) { + suffix_expansions = valid_affix_expansions(suffix, options); + if (suffix_expansions == NULL) have_suffix = false; + } + + if (!have_suffix && !have_prefix) { + return false; + } + + char_array *key = char_array_new_size(token.len); + + if (have_prefix && have_suffix) { + for (size_t i = 0; i < prefix_expansions->n; i++) { + prefix_expansion = prefix_expansions->a[i]; + char_array_clear(key); + + cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + prefix_start = key->n - 1; + + add_space = (int)prefix_expansion.separable || with_period; + if (prefix.len + skip_period + suffix.len < token.len && 
!prefix_expansion.separable) { + add_space = suffix_expansion.separable || with_period; + } + + for (spaces = skip_period; spaces <= add_space; spaces++) { + key->n = prefix_start; + if (spaces) { + char_array_cat(key, " "); + } + + prefix_end = key->n; + + if (prefix.len + skip_period + suffix.len < token.len) { + root_len = token.len - suffix.len - prefix.len - skip_period; + size_t root_start = token.offset + prefix.len + skip_period; + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); + root_start += prefix_hyphen_len; + root_len -= prefix_hyphen_len; + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); + root_len -= suffix_hyphen_len; + root_token = (token_t){root_start, root_len, token.type}; + root_strings = cstring_array_new_size(root_len); + add_normalized_strings_token(root_strings, str, root_token, options); + num_strings = cstring_array_num_strings(root_strings); + + for (size_t j = 0; j < num_strings; j++) { + key->n = prefix_end; + root_word = cstring_array_get_string(root_strings, j); + char_array_cat(key, root_word); + root_end = key->n - 1; + + for (size_t k = 0; k < suffix_expansions->n; k++) { + key->n = root_end; + suffix_expansion = suffix_expansions->a[k]; + + int add_suffix_space = suffix_expansion.separable; + + suffix_start = key->n; + for (int suffix_spaces = skip_period; suffix_spaces <= add_suffix_space; suffix_spaces++) { + key->n = suffix_start; + if (suffix_spaces) { + char_array_cat(key, " "); + } + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(strings, expansion); + + } + + + } + } + + cstring_array_destroy(root_strings); + root_strings = NULL; + + } else { + for (size_t j = 0; j < suffix_expansions->n; j++) { + key->n = prefix_end - skip_period; + suffix_expansion = suffix_expansions->a[j]; + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + 
expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + } + } + + } + } else if (have_suffix) { + log_debug("suffix.start=%" PRId32 "\n", suffix.start); + root_len = suffix.start; + root_token = (token_t){token.offset, root_len, token.type}; + log_debug("root_len=%zu\n", root_len); + log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type); + + root_strings = cstring_array_new_size(root_len + 1); + add_normalized_strings_token(root_strings, str, root_token, options); + num_strings = cstring_array_num_strings(root_strings); + + log_debug("num_strings = %zu\n", num_strings); + + for (size_t j = 0; j < num_strings; j++) { + char_array_clear(key); + root_word = cstring_array_get_string(root_strings, j); + log_debug("root_word=%s\n", root_word); + char_array_cat(key, root_word); + root_end = key->n - 1; + + for (size_t k = 0; k < suffix_expansions->n; k++) { + key->n = root_end; + suffix_expansion = suffix_expansions->a[k]; + + add_space = (suffix_expansion.separable || with_period) && suffix.len < token.len; + suffix_start = key->n; + + for (int spaces = skip_period; spaces <= add_space; spaces++) { + key->n = suffix_start; + if (spaces) { + char_array_cat(key, " "); + } + + cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + } + } + } else if (have_prefix) { + if (prefix.len + skip_period <= token.len) { + root_len = token.len - prefix.len - skip_period; + size_t root_start = token.offset + prefix.len + skip_period; + size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); + root_start += prefix_hyphen_len; + root_len -= prefix_hyphen_len; + size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); + root_len -= suffix_hyphen_len; + root_token = (token_t){root_start, root_len, token.type}; + root_strings = 
cstring_array_new_size(root_len); + add_normalized_strings_token(root_strings, str, root_token, options); + num_strings = cstring_array_num_strings(root_strings); + + } else { + root_strings = cstring_array_new_size(token.len); + add_normalized_strings_token(root_strings, str, token, options); + num_strings = cstring_array_num_strings(root_strings); + + for (size_t k = 0; k < num_strings; k++) { + root_word = cstring_array_get_string(root_strings, k); + cstring_array_add_string(tree->strings, root_word); + } + + char_array_destroy(key); + cstring_array_destroy(root_strings); + return false; + + } + + for (size_t j = 0; j < prefix_expansions->n; j++) { + char_array_clear(key); + prefix_expansion = prefix_expansions->a[j]; + + cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); + prefix_end = key->n - 1; + + add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len; + for (int spaces = skip_period; spaces <= add_space; spaces++) { + key->n = prefix_end; + if (spaces) { + char_array_cat(key, " "); + } + size_t prefix_space_len = key->n - spaces; + for (size_t k = 0; k < num_strings; k++) { + key->n = prefix_space_len; + root_word = cstring_array_get_string(root_strings, k); + char_array_cat(key, root_word); + + expansion = char_array_get_string(key); + cstring_array_add_string(tree->strings, expansion); + } + + } + } + } + + char_array_destroy(key); + + if (root_strings != NULL) { + cstring_array_destroy(root_strings); + } + + return true; + +} + +inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); + + phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); + + if ((suffix.len == 0 && prefix.len == 0)) return false; + + bool with_period = false; + + return add_affix_expansions(tree, str, lang, token, prefix, 
suffix, options, with_period); +} + +inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { + ssize_t first_period_index = string_next_period_len(str + token.offset, token.len); + if (first_period_index > 0) { + ssize_t next_period_index = string_next_period_len(str + token.offset + first_period_index + 1, token.len - first_period_index - 1); + // Token contains only one period or one + a final period + if (next_period_index < 0 || next_period_index == token.len - 1) { + phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang); + + phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang); + if (suffix.len > 0) { + suffix.start = first_period_index + 1; + } + + if (suffix.len == 0 && prefix.len == 0) return false; + + bool with_period = true; + + return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); + } else { + return false; + } + } else { + return false; + } +} + +bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) { + bool have_period_affixes = false; + if (string_contains_period_len(str + token.offset, token.len)) { + for (size_t l = 0; l < options.num_languages; l++) { + char *lang = options.languages[l]; + if (expand_affixes_period(tree, str, lang, token, options)) { + have_period_affixes = true; + break; + } + } + } + + if (!have_period_affixes) { + string_tree_add_string_len(tree, str + token.offset, token.len); + } + + return have_period_affixes; +} + + +string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) { + char_array *key = NULL; + + log_debug("input=%s\n", str); + token_array *tokens = tokenize_keep_whitespace(str); + + if (tokens == NULL) { + return NULL; + } + + size_t len = strlen(str); + + 
log_debug("tokenized, num tokens=%zu\n", tokens->n); + + bool last_was_punctuation = false; + + phrase_language_array *phrases = NULL; + phrase_array *lang_phrases = NULL; + + + for (size_t i = 0; i < options.num_languages; i++) { + char *lang = options.languages[i]; + log_debug("lang=%s\n", lang); + + lang_phrases = search_address_dictionaries_tokens(str, tokens, lang); + + if (lang_phrases == NULL) { + log_debug("lang_phrases NULL\n"); + continue; + } + + log_debug("lang_phrases->n = %zu\n", lang_phrases->n); + + phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); + + for (size_t j = 0; j < lang_phrases->n; j++) { + phrase_t p = lang_phrases->a[j]; + log_debug("lang=%s, (%d, %d)\n", lang, p.start, p.len); + phrase_language_array_push(phrases, (phrase_language_t){lang, p}); + } + + phrase_array_destroy(lang_phrases); + } + + + lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES); + if (lang_phrases != NULL) { + phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); + + for (size_t j = 0; j < lang_phrases->n; j++) { + phrase_t p = lang_phrases->a[j]; + phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p}); + } + phrase_array_destroy(lang_phrases); + + } + + string_tree_t *tree = string_tree_new_size(len); + + bool last_added_was_whitespace = false; + + uint64_t normalize_string_options = get_normalize_string_options(options); + + if (phrases != NULL) { + log_debug("phrases not NULL, n=%zu\n", phrases->n); + ks_introsort(phrase_language_array, phrases->n, phrases->a); + + phrase_language_t phrase_lang; + + size_t start = 0; + size_t end = 0; + + phrase_t phrase = NULL_PHRASE; + phrase_t prev_phrase = NULL_PHRASE; + + key = key != NULL ? 
key : char_array_new_size(DEFAULT_KEY_LEN); + + for (size_t i = 0; i < phrases->n; i++) { + phrase_lang = phrases->a[i]; + + phrase = phrase_lang.phrase; + + log_debug("phrase.start=%d, phrase.len=%d, lang=%s, prev_phrase.start=%d, prev_phrase.len=%d\n", phrase.start, phrase.len, phrase_lang.language, prev_phrase.start, prev_phrase.len); + + if ((phrase.start > prev_phrase.start && phrase.start < prev_phrase.start + prev_phrase.len) || (phrase.start == prev_phrase.start && i > 0 && phrase.len < prev_phrase.len)) { + log_debug("continuing\n"); + continue; + } + + char_array_clear(key); + + char_array_cat(key, phrase_lang.language); + char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); + + size_t namespace_len = key->n; + + end = phrase.start; + + log_debug("start=%zu, end=%zu\n", start, end); + for (size_t j = start; j < end; j++) { + log_debug("Adding token %zu\n", j); + token_t token = tokens->a[j]; + if (is_punctuation(token.type)) { + last_was_punctuation = true; + continue; + } + + if (token.type != WHITESPACE) { + if (phrase.start > 0 && last_was_punctuation && !last_added_was_whitespace) { + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); + + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + last_added_was_whitespace = false; + } else if (!last_added_was_whitespace) { + log_debug("Adding pre-phrase whitespace\n"); + last_added_was_whitespace = true; + string_tree_add_string(tree, " "); + } else { + continue; + } + + last_was_punctuation = false; + string_tree_finalize_token(tree); + } + + if (phrase.start > 0 && start < end) { + token_t prev_token = tokens->a[phrase.start - 1]; + log_debug("last_added_was_whitespace=%d\n", last_added_was_whitespace); + if (!last_added_was_whitespace && phrase.start - 1 > 0 && (!is_ideographic(prev_token.type) || last_was_punctuation)) { + log_debug("Adding space III\n"); + 
string_tree_add_string(tree, " "); + last_added_was_whitespace = true; + string_tree_finalize_token(tree); + } + } + + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + token_t token; + + size_t added_expansions = 0; + if ((value->components & options.address_components) > 0) { + key->n = namespace_len; + for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { + token = tokens->a[j]; + if (token.type != WHITESPACE) { + char_array_cat_len(key, str + token.offset, token.len); + last_added_was_whitespace = false; + } else { + char_array_cat(key, " "); + last_added_was_whitespace = true; + } + } + + char *key_str = char_array_get_string(key); + log_debug("key_str=%s\n", key_str); + address_expansion_array *expansions = value->expansions; + + if (expansions != NULL) { + for (size_t j = 0; j < expansions->n; j++) { + address_expansion_t expansion = expansions->a[j]; + + if ((expansion.address_components & options.address_components) == 0 && !address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { + continue; + } + + if (expansion.canonical_index != NULL_CANONICAL_INDEX) { + char *canonical = address_dictionary_get_canonical(expansion.canonical_index); + char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); + + canonical = canonical_normalized != NULL ? 
canonical_normalized : canonical; + + + if (phrase.start + phrase.len < tokens->n - 1) { + token_t next_token = tokens->a[phrase.start + phrase.len]; + if (!is_numeric_token(next_token.type)) { + log_debug("non-canonical phrase, adding canonical string\n"); + string_tree_add_string(tree, canonical); + last_added_was_whitespace = false; + } else { + log_debug("adding canonical with cstring_array methods\n"); + uint32_t start_index = cstring_array_start_token(tree->strings); + cstring_array_append_string(tree->strings, canonical); + cstring_array_append_string(tree->strings, " "); + last_added_was_whitespace = true; + cstring_array_terminate(tree->strings); + } + } else { + string_tree_add_string(tree, canonical); + last_added_was_whitespace = false; + + } + + if (canonical_normalized != NULL) { + free(canonical_normalized); + } + } else { + log_debug("canonical phrase, adding canonical string\n"); + + uint32_t start_index = cstring_array_start_token(tree->strings); + for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) { + token = tokens->a[k]; + if (token.type != WHITESPACE) { + cstring_array_append_string_len(tree->strings, str + token.offset, token.len); + last_added_was_whitespace = false; + } else { + log_debug("space\n"); + cstring_array_append_string(tree->strings, " "); + last_added_was_whitespace = true; + } + } + cstring_array_terminate(tree->strings); + } + + added_expansions++; + } + + + } + } + + if (added_expansions == 0) { + uint32_t start_index = cstring_array_start_token(tree->strings); + for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { + token = tokens->a[j]; + + if (token.type != WHITESPACE) { + log_debug("Adding canonical token, %.*s\n", (int)token.len, str + token.offset); + cstring_array_append_string_len(tree->strings, str + token.offset, token.len); + last_added_was_whitespace = false; + } else if (!last_added_was_whitespace) { + log_debug("Adding space\n"); + cstring_array_append_string(tree->strings, " "); + 
last_added_was_whitespace = true; + } + + } + + if (phrase.start + phrase.len < tokens->n - 1) { + token_t next_token = tokens->a[phrase.start + phrase.len + 1]; + if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { + cstring_array_append_string(tree->strings, " "); + last_added_was_whitespace = true; + } + } + + cstring_array_terminate(tree->strings); + + } + + log_debug("i=%zu\n", i); + bool end_of_phrase = false; + if (i < phrases->n - 1) { + phrase_t next_phrase = phrases->a[i + 1].phrase; + end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); + } else { + end_of_phrase = true; + } + + log_debug("end_of_phrase=%d\n", end_of_phrase); + if (end_of_phrase) { + log_debug("finalize at i=%zu\n", i); + string_tree_finalize_token(tree); + } + + start = phrase.start + phrase.len; + prev_phrase = phrase; + + } + + char_array_destroy(key); + + end = (int)tokens->n; + + if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) { + token_t next_token = tokens->a[phrase.start + phrase.len]; + if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { + log_debug("space after phrase\n"); + string_tree_add_string(tree, " "); + last_added_was_whitespace = true; + string_tree_finalize_token(tree); + } + } + + + for (size_t j = start; j < end; j++) { + log_debug("On token %zu\n", j); + token_t token = tokens->a[j]; + if (is_punctuation(token.type)) { + log_debug("last_was_punctuation\n"); + last_was_punctuation = true; + continue; + } + + if (token.type != WHITESPACE) { + if (j > 0 && last_was_punctuation && !last_added_was_whitespace) { + log_debug("Adding another space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); + + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + 
last_added_was_whitespace = false; + } else if (!last_added_was_whitespace) { + log_debug("Adding space IV\n"); + string_tree_add_string(tree, " "); + last_added_was_whitespace = true; + } else { + log_debug("Skipping token %zu\n", j); + continue; + } + + last_was_punctuation = false; + string_tree_finalize_token(tree); + + } + + + } else { + + for (size_t j = 0; j < tokens->n; j++) { + log_debug("On token %zu\n", j); + token_t token = tokens->a[j]; + if (is_punctuation(token.type)) { + log_debug("punctuation, skipping\n"); + last_was_punctuation = true; + continue; + } + + if (token.type != WHITESPACE) { + if (last_was_punctuation && !last_added_was_whitespace) { + log_debug("Adding space V\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + } + + bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + last_added_was_whitespace = false; + } else if (!last_added_was_whitespace) { + log_debug("Adding space VI\n"); + string_tree_add_string(tree, " "); + last_added_was_whitespace = true; + } else { + continue; + } + + last_was_punctuation = false; + string_tree_finalize_token(tree); + } + } + + if (phrases != NULL) { + phrase_language_array_destroy(phrases); + } + + token_array_destroy(tokens); + + return tree; +} + + +inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { + size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); + + int32_t unichr = 0; + const uint8_t *ptr = (const uint8_t *)str; + + if (len_ordinal_suffix > 0) { + ssize_t start = 0; + size_t token_offset = token.offset; + size_t token_len = token.len; + + if (len_ordinal_suffix < token.len) { + start = token.offset + token.len - len_ordinal_suffix; + token_offset = token.offset; + token_len = token.len - len_ordinal_suffix; + } else { + start = prev_token.offset + prev_token.len; + token_offset = 
prev_token.offset; + token_len = prev_token.len; + } + ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); + if (prev_char_len <= 0) return false; + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) { + return false; + } + } else { + return false; + } + + cstring_array *strings = tree->strings; + // Add the original form first. When this function returns true, + // add_normalized_strings_token won't be called a second time. + add_normalized_strings_token(strings, str, token, options); + + token_t normalized_token = token; + normalized_token.len = token.len - len_ordinal_suffix; + add_normalized_strings_token(strings, str, normalized_token, options); + return true; +} + +inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { + cstring_array *strings = tree->strings; + + token_t prev_token = (token_t){0, 0, 0}; + + for (size_t i = 0; i < tokens->n; i++) { + token_t token = tokens->a[i]; + bool have_phrase = false; + bool have_ordinal = false; + + if (is_special_token(token.type)) { + string_tree_add_string_len(tree, str + token.offset, token.len); + string_tree_finalize_token(tree); + continue; + } + + for (size_t j = 0; j < options.num_languages; j++) { + char *lang = options.languages[j]; + if (expand_affixes(tree, str, lang, token, options)) { + have_phrase = true; + break; + } + + if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) { + have_ordinal = true; + break; + } + } + + if (!have_phrase && !have_ordinal) { + add_normalized_strings_token(strings, str, token, options); + } + + string_tree_finalize_token(tree); + prev_token = token; + } + +} + + +void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options) { + size_t len = strlen(str); + token_array *tokens = tokenize_keep_whitespace(str); + string_tree_t 
*token_tree = string_tree_new_size(len); + + add_normalized_strings_tokenized(token_tree, str, tokens, options); + + string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree); + + string_tree_iterator_t *iter; + + char_array *temp_string = char_array_new_size(len); + + char *token; + + char *lang; + + kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); + + bool excessive_perms_outer = tokenized_iter->remaining >= EXCESSIVE_PERMUTATIONS; + + if (!excessive_perms_outer) { + kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); + } + + log_debug("tokenized_iter->remaining=%d\n", tokenized_iter->remaining); + + for (; !string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) { + char_array_clear(temp_string); + + string_tree_iterator_foreach_token(tokenized_iter, token, { + if (token == NULL) { + continue; + } + char_array_append(temp_string, token); + }) + char_array_terminate(temp_string); + + char *tokenized_str = char_array_get_string(temp_string); + + string_tree_t *alternatives; + + int ret; + log_debug("Adding alternatives for single normalization\n"); + alternatives = add_string_alternatives(tokenized_str, options); + + log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives)); + + if (alternatives == NULL) { + log_debug("alternatives = NULL\n"); + continue; + } + + iter = string_tree_iterator_new(alternatives); + log_debug("iter->num_tokens=%d\n", iter->num_tokens); + log_debug("iter->remaining=%d\n", iter->remaining); + + bool excessive_perms_inner = iter->remaining >= EXCESSIVE_PERMUTATIONS; + + if (!excessive_perms_inner && !excessive_perms_outer) { + for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { + char_array_clear(temp_string); + string_tree_iterator_foreach_token(iter, token, { + log_debug("token=%s\n", token); + char_array_append(temp_string, token); + }) + 
char_array_terminate(temp_string); + + token = char_array_get_string(temp_string); + log_debug("full string=%s\n", token); + khiter_t k = kh_get(str_set, unique_strings, token); + + if (k == kh_end(unique_strings)) { + log_debug("doing postprocessing\n"); + add_postprocessed_string(strings, token, options); + k = kh_put(str_set, unique_strings, strdup(token), &ret); + } + + log_debug("iter->remaining = %d\n", iter->remaining); + + } + } else { + cstring_array_add_string(strings, tokenized_str); + } + + string_tree_iterator_destroy(iter); + string_tree_destroy(alternatives); + + if (excessive_perms_outer) { + break; + } + } + + string_tree_iterator_destroy(tokenized_iter); + string_tree_destroy(token_tree); + + token_array_destroy(tokens); + + char_array_destroy(temp_string); +} + + + +char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { + options.address_components |= LIBPOSTAL_ADDRESS_ANY; + + uint64_t normalize_string_options = get_normalize_string_options(options); + + size_t len = strlen(input); + + language_classifier_response_t *lang_response = NULL; + + if (options.num_languages == 0) { + lang_response = classify_languages(input); + if (lang_response != NULL) { + options.num_languages = lang_response->num_languages; + options.languages = lang_response->languages; + } + } + + string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages); + + cstring_array *strings = cstring_array_new_size(len * 2); + char_array *temp_string = char_array_new_size(len); + + khash_t(str_set) *unique_strings = kh_init(str_set); + + char *token; + + log_debug("string_tree_num_tokens(tree) = %d\n", string_tree_num_tokens(tree)); + + if (string_tree_num_strings(tree) == 1) { + char *normalized = string_tree_get_alternative(tree, 0, 0); + expand_alternative(strings, unique_strings, normalized, options); + + } else { + log_debug("Adding alternatives for multiple normalizations\n"); + 
string_tree_iterator_t *iter = string_tree_iterator_new(tree); + + for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { + char *segment; + char_array_clear(temp_string); + bool is_first = true; + + string_tree_iterator_foreach_token(iter, segment, { + if (!is_first) { + char_array_append(temp_string, " "); + } + char_array_append(temp_string, segment); + is_first = false; + }) + char_array_terminate(temp_string); + token = char_array_get_string(temp_string); + log_debug("current permutation = %s\n", token); + expand_alternative(strings, unique_strings, token, options); + } + + string_tree_iterator_destroy(iter); + } + + char *key_str = NULL; + for (size_t i = kh_begin(unique_strings); i != kh_end(unique_strings); ++i) { + if (!kh_exist(unique_strings, i)) continue; + key_str = (char *)kh_key(unique_strings, i); + free(key_str); + } + + kh_destroy(str_set, unique_strings); + + if (lang_response != NULL) { + language_classifier_response_destroy(lang_response); + } + + char_array_destroy(temp_string); + string_tree_destroy(tree); + + *n = cstring_array_num_strings(strings); + + return cstring_array_to_strings(strings); + +} + +void expansion_array_destroy(char **expansions, size_t n) { + for (size_t i = 0; i < n; i++) { + free(expansions[i]); + } + free(expansions); +} + diff --git a/src/expand.h b/src/expand.h new file mode 100644 index 00000000..0e24cae4 --- /dev/null +++ b/src/expand.h @@ -0,0 +1,52 @@ +#ifndef EXPAND_H +#define EXPAND_H + +#include +#include + +#include "libpostal.h" + +#include "address_dictionary.h" +#include "collections.h" +#include "klib/khash.h" +#include "klib/ksort.h" +#include "trie_search.h" + +typedef struct phrase_language { + char *language; + phrase_t phrase; +} phrase_language_t; + +VECTOR_INIT(phrase_language_array, phrase_language_t) + +#define ks_lt_phrase_language(a, b) ((a).phrase.start < (b).phrase.start || ((a).phrase.start == (b).phrase.start && (a).phrase.len > (b).phrase.len)) + 
+KSORT_INIT(phrase_language_array, phrase_language_t, ks_lt_phrase_language) + +uint64_t get_normalize_token_options(libpostal_normalize_options_t options); +uint64_t get_normalize_string_options(libpostal_normalize_options_t options); + +void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options); +void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options); + +address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options); + +void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options); +bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period); + +bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); +bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); +bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options); + +string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options); + +bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options); + +void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options); + +void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options); +char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n); +char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); +void 
expansion_array_destroy(char **expansions, size_t n); + +#endif \ No newline at end of file diff --git a/src/libpostal.c b/src/libpostal.c index 2c0a8521..32d80331 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -8,30 +8,13 @@ #include "address_dictionary.h" #include "address_parser.h" -#include "collections.h" -#include "constants.h" +#include "expand.h" + #include "language_classifier.h" -#include "numex.h" #include "normalize.h" #include "scanner.h" #include "string_utils.h" #include "token_types.h" -#include "transliterate.h" - -typedef struct phrase_language { - char *language; - phrase_t phrase; -} phrase_language_t; - -VECTOR_INIT(phrase_language_array, phrase_language_t) - -#define ks_lt_phrase_language(a, b) ((a).phrase.start < (b).phrase.start || ((a).phrase.start == (b).phrase.start && (a).phrase.len > (b).phrase.len)) - -KSORT_INIT(phrase_language_array, phrase_language_t, ks_lt_phrase_language) - -#define DEFAULT_KEY_LEN 32 - -#define EXCESSIVE_PERMUTATIONS 100 static libpostal_normalize_options_t LIBPOSTAL_DEFAULT_OPTIONS = { .languages = NULL, @@ -61,1089 +44,12 @@ libpostal_normalize_options_t libpostal_get_default_options(void) { return LIBPOSTAL_DEFAULT_OPTIONS; } -static inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) { - uint64_t normalize_token_options = 0; - - normalize_token_options |= options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0; - normalize_token_options |= options.delete_acronym_periods ? NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS : 0; - normalize_token_options |= options.drop_english_possessives ? NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES : 0; - normalize_token_options |= options.delete_apostrophes ? NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE : 0; - - return normalize_token_options; -} - -static inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) { - uint64_t normalize_string_options = 0; - normalize_string_options |= options.transliterate ? 
NORMALIZE_STRING_TRANSLITERATE : 0; - normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0; - normalize_string_options |= options.decompose ? NORMALIZE_STRING_DECOMPOSE : 0; - normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0; - normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0; - normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0; - normalize_string_options |= options.expand_numex ? NORMALIZE_STRING_REPLACE_NUMEX : 0; - - return normalize_string_options; -} - - -static inline size_t string_hyphen_prefix_len(char *str, size_t len) { - // Strip beginning hyphens - int32_t unichr; - uint8_t *ptr = (uint8_t *)str; - ssize_t char_len = utf8proc_iterate(ptr, len, &unichr); - if (utf8_is_hyphen(unichr)) { - return (size_t)char_len; - } - return 0; -} - -static inline size_t string_hyphen_suffix_len(char *str, size_t len) { - // Strip beginning hyphens - int32_t unichr; - uint8_t *ptr = (uint8_t *)str; - ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr); - if (utf8_is_hyphen(unichr)) { - return (size_t)char_len; - } - return 0; -} - -static void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { - - uint64_t normalize_token_options = get_normalize_token_options(options); - - if (token.type != WHITESPACE ) { - - bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len); - - if (!contains_hyphen || token.type == HYPHEN) { - log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type); - normalize_token(strings, str, token, normalize_token_options); - } else if (is_word_token(token.type)) { - - size_t prefix_hyphen_len = string_hyphen_prefix_len(str + token.offset, token.len); - if (prefix_hyphen_len > 0) { - token.offset += prefix_hyphen_len; - } - - size_t suffix_hyphen_len = string_hyphen_suffix_len(str + token.offset, 
token.len); - if (suffix_hyphen_len > 0) { - token.len -= suffix_hyphen_len; - } - - normalize_token(strings, str, token, normalize_token_options); - - if (options.replace_word_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS; - } - - if (options.delete_word_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; - } - - } else if (is_numeric_token(token.type)) { - normalize_token(strings, str, token, normalize_token_options); - - if (options.replace_word_hyphens || options.replace_numeric_hyphens) { - if (options.replace_word_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS; - } - - if (options.replace_numeric_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; - } - - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS; - } - - if (options.delete_numeric_hyphens) { - normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS; - } - } - - if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) { - normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; - } - } else { - cstring_array_add_string(strings, " "); - } -} - -static void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) { - cstring_array_add_string(strings, str); - - if 
(options.roman_numerals) { - char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE); - if (numex_replaced != NULL) { - cstring_array_add_string(strings, numex_replaced); - free(numex_replaced); - } - - } - -} - - - -static address_expansion_array *get_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) { - uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - if (value != NULL && value->components & options.address_components) { - return value->expansions; - } - - return NULL; -} - -static inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) { - if (expansion.canonical_index != NULL_CANONICAL_INDEX) { - char *canonical = address_dictionary_get_canonical(expansion.canonical_index); - uint64_t normalize_string_options = get_normalize_string_options(options); - char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); - canonical = canonical_normalized != NULL ? canonical_normalized : canonical; - - char_array_cat(key, canonical); - if (canonical_normalized != NULL) { - free(canonical_normalized); - } - } else { - char_array_cat_len(key, str + token.offset + phrase.start, phrase.len); - } -} - - -static bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) { - cstring_array *strings = tree->strings; - - size_t skip_period = with_period ? 
1 : 0; - - bool have_suffix = suffix.len > 0 && suffix.len < token.len; - bool have_prefix = prefix.len > 0 && prefix.len + with_period < token.len; - - if (!have_suffix && !have_prefix) { - return false; - } - - address_expansion_array *prefix_expansions = NULL; - address_expansion_array *suffix_expansions = NULL; - - address_expansion_t prefix_expansion; - address_expansion_t suffix_expansion; - - char *expansion; - - size_t num_strings = 0; - char *root_word = NULL; - size_t root_len; - token_t root_token; - cstring_array *root_strings = NULL; - int add_space = 0; - int spaces = 0; - - size_t prefix_start, prefix_end, root_end, suffix_start; - - if (have_prefix) { - prefix_expansions = get_affix_expansions(prefix, options); - if (prefix_expansions == NULL) have_prefix = false; - } - - if (have_suffix) { - suffix_expansions = get_affix_expansions(suffix, options); - if (suffix_expansions == NULL) have_suffix = false; - } - - if (!have_suffix && !have_prefix) { - return false; - } - - char_array *key = char_array_new_size(token.len); - - if (have_prefix && have_suffix) { - for (size_t i = 0; i < prefix_expansions->n; i++) { - prefix_expansion = prefix_expansions->a[i]; - char_array_clear(key); - - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); - prefix_start = key->n - 1; - - add_space = (int)prefix_expansion.separable || with_period; - if (prefix.len + skip_period + suffix.len < token.len && !prefix_expansion.separable) { - add_space = suffix_expansion.separable || with_period; - } - - for (spaces = skip_period; spaces <= add_space; spaces++) { - key->n = prefix_start; - if (spaces) { - char_array_cat(key, " "); - } - - prefix_end = key->n; - - if (prefix.len + skip_period + suffix.len < token.len) { - root_len = token.len - suffix.len - prefix.len - skip_period; - size_t root_start = token.offset + prefix.len + skip_period; - size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); - root_start += 
prefix_hyphen_len; - root_len -= prefix_hyphen_len; - size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); - root_len -= suffix_hyphen_len; - root_token = (token_t){root_start, root_len, token.type}; - root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - for (size_t j = 0; j < num_strings; j++) { - key->n = prefix_end; - root_word = cstring_array_get_string(root_strings, j); - char_array_cat(key, root_word); - root_end = key->n - 1; - - for (size_t k = 0; k < suffix_expansions->n; k++) { - key->n = root_end; - suffix_expansion = suffix_expansions->a[k]; - - int add_suffix_space = suffix_expansion.separable; - - suffix_start = key->n; - for (int suffix_spaces = skip_period; suffix_spaces <= add_suffix_space; suffix_spaces++) { - key->n = suffix_start; - if (suffix_spaces) { - char_array_cat(key, " "); - } - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(strings, expansion); - - } - - - } - } - - cstring_array_destroy(root_strings); - root_strings = NULL; - - } else { - for (size_t j = 0; j < suffix_expansions->n; j++) { - key->n = prefix_end - skip_period; - suffix_expansion = suffix_expansions->a[j]; - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - } - } - - } - } else if (have_suffix) { - log_debug("suffix.start=%" PRId32 "\n", suffix.start); - root_len = suffix.start; - root_token = (token_t){token.offset, root_len, token.type}; - log_debug("root_len=%zu\n", root_len); - log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type); - - root_strings = cstring_array_new_size(root_len + 1); - add_normalized_strings_token(root_strings, str, root_token, 
options); - num_strings = cstring_array_num_strings(root_strings); - - log_debug("num_strings = %zu\n", num_strings); - - for (size_t j = 0; j < num_strings; j++) { - char_array_clear(key); - root_word = cstring_array_get_string(root_strings, j); - log_debug("root_word=%s\n", root_word); - char_array_cat(key, root_word); - root_end = key->n - 1; - - for (size_t k = 0; k < suffix_expansions->n; k++) { - key->n = root_end; - suffix_expansion = suffix_expansions->a[k]; - - add_space = (suffix_expansion.separable || with_period) && suffix.len < token.len; - suffix_start = key->n; - - for (int spaces = skip_period; spaces <= add_space; spaces++) { - key->n = suffix_start; - if (spaces) { - char_array_cat(key, " "); - } - - cat_affix_expansion(key, str, suffix_expansion, token, suffix, options); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - } - } - } else if (have_prefix) { - if (prefix.len + skip_period <= token.len) { - root_len = token.len - prefix.len - skip_period; - size_t root_start = token.offset + prefix.len + skip_period; - size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len); - root_start += prefix_hyphen_len; - root_len -= prefix_hyphen_len; - size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len); - root_len -= suffix_hyphen_len; - root_token = (token_t){root_start, root_len, token.type}; - root_strings = cstring_array_new_size(root_len); - add_normalized_strings_token(root_strings, str, root_token, options); - num_strings = cstring_array_num_strings(root_strings); - - } else { - root_strings = cstring_array_new_size(token.len); - add_normalized_strings_token(root_strings, str, token, options); - num_strings = cstring_array_num_strings(root_strings); - - for (size_t k = 0; k < num_strings; k++) { - root_word = cstring_array_get_string(root_strings, k); - cstring_array_add_string(tree->strings, root_word); - } - - char_array_destroy(key); - 
cstring_array_destroy(root_strings); - return false; - - } - - for (size_t j = 0; j < prefix_expansions->n; j++) { - char_array_clear(key); - prefix_expansion = prefix_expansions->a[j]; - - cat_affix_expansion(key, str, prefix_expansion, token, prefix, options); - prefix_end = key->n - 1; - - add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len; - for (int spaces = skip_period; spaces <= add_space; spaces++) { - key->n = prefix_end; - if (spaces) { - char_array_cat(key, " "); - } - size_t prefix_space_len = key->n - spaces; - for (size_t k = 0; k < num_strings; k++) { - key->n = prefix_space_len; - root_word = cstring_array_get_string(root_strings, k); - char_array_cat(key, root_word); - - expansion = char_array_get_string(key); - cstring_array_add_string(tree->strings, expansion); - } - - } - } - } - - char_array_destroy(key); - - if (root_strings != NULL) { - cstring_array_destroy(root_strings); - } - - return true; - -} - -static inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { - phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang); - - phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang); - - if ((suffix.len == 0 && prefix.len == 0)) return false; - - bool with_period = false; - - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); -} - -static inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) { - ssize_t first_period_index = string_next_period_len(str + token.offset, token.len); - if (first_period_index > 0) { - ssize_t next_period_index = string_next_period_len(str + token.offset + first_period_index + 1, token.len - first_period_index - 1); - // Token contains only one period or one + a final period - if (next_period_index < 0 || next_period_index 
== token.len - 1) { - phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang); - - phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang); - if (suffix.len > 0) { - suffix.start = first_period_index + 1; - } - - if (suffix.len == 0 && prefix.len == 0) return false; - - bool with_period = true; - - return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period); - } else { - return false; - } - } else { - return false; - } -} - -static bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) { - bool have_period_affixes = false; - if (string_contains_period_len(str + token.offset, token.len)) { - for (size_t l = 0; l < options.num_languages; l++) { - char *lang = options.languages[l]; - if (expand_affixes_period(tree, str, lang, token, options)) { - have_period_affixes = true; - break; - } - } - } - - if (!have_period_affixes) { - string_tree_add_string_len(tree, str + token.offset, token.len); - } - - return have_period_affixes; -} - - -static string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) { - char_array *key = NULL; - - log_debug("input=%s\n", str); - token_array *tokens = tokenize_keep_whitespace(str); - - if (tokens == NULL) { - return NULL; - } - - size_t len = strlen(str); - - log_debug("tokenized, num tokens=%zu\n", tokens->n); - - bool last_was_punctuation = false; - - phrase_language_array *phrases = NULL; - phrase_array *lang_phrases = NULL; - - - for (size_t i = 0; i < options.num_languages; i++) { - char *lang = options.languages[i]; - log_debug("lang=%s\n", lang); - - lang_phrases = search_address_dictionaries_tokens(str, tokens, lang); - - if (lang_phrases == NULL) { - log_debug("lang_phrases NULL\n"); - continue; - } - - log_debug("lang_phrases->n = %zu\n", lang_phrases->n); - - phrases = 
phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); - - for (size_t j = 0; j < lang_phrases->n; j++) { - phrase_t p = lang_phrases->a[j]; - log_debug("lang=%s, (%d, %d)\n", lang, p.start, p.len); - phrase_language_array_push(phrases, (phrase_language_t){lang, p}); - } - - phrase_array_destroy(lang_phrases); - } - - - lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES); - if (lang_phrases != NULL) { - phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); - - for (size_t j = 0; j < lang_phrases->n; j++) { - phrase_t p = lang_phrases->a[j]; - phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p}); - } - phrase_array_destroy(lang_phrases); - - } - - string_tree_t *tree = string_tree_new_size(len); - - bool last_added_was_whitespace = false; - - uint64_t normalize_string_options = get_normalize_string_options(options); - - if (phrases != NULL) { - log_debug("phrases not NULL, n=%zu\n", phrases->n); - ks_introsort(phrase_language_array, phrases->n, phrases->a); - - phrase_language_t phrase_lang; - - size_t start = 0; - size_t end = 0; - - phrase_t phrase = NULL_PHRASE; - phrase_t prev_phrase = NULL_PHRASE; - - key = key != NULL ? 
key : char_array_new_size(DEFAULT_KEY_LEN); - - for (size_t i = 0; i < phrases->n; i++) { - phrase_lang = phrases->a[i]; - - phrase = phrase_lang.phrase; - - log_debug("phrase.start=%d, phrase.len=%d, lang=%s, prev_phrase.start=%d, prev_phrase.len=%d\n", phrase.start, phrase.len, phrase_lang.language, prev_phrase.start, prev_phrase.len); - - if ((phrase.start > prev_phrase.start && phrase.start < prev_phrase.start + prev_phrase.len) || (phrase.start == prev_phrase.start && i > 0 && phrase.len < prev_phrase.len)) { - log_debug("continuing\n"); - continue; - } - - char_array_clear(key); - - char_array_cat(key, phrase_lang.language); - char_array_cat(key, NAMESPACE_SEPARATOR_CHAR); - - size_t namespace_len = key->n; - - end = phrase.start; - - log_debug("start=%zu, end=%zu\n", start, end); - for (size_t j = start; j < end; j++) { - log_debug("Adding token %zu\n", j); - token_t token = tokens->a[j]; - if (is_punctuation(token.type)) { - last_was_punctuation = true; - continue; - } - - if (token.type != WHITESPACE) { - if (phrase.start > 0 && last_was_punctuation && !last_added_was_whitespace) { - string_tree_add_string(tree, " "); - string_tree_finalize_token(tree); - } - log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); - - bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); - last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding pre-phrase whitespace\n"); - last_added_was_whitespace = true; - string_tree_add_string(tree, " "); - } else { - continue; - } - - last_was_punctuation = false; - string_tree_finalize_token(tree); - } - - if (phrase.start > 0 && start < end) { - token_t prev_token = tokens->a[phrase.start - 1]; - log_debug("last_added_was_whitespace=%d\n", last_added_was_whitespace); - if (!last_added_was_whitespace && phrase.start - 1 > 0 && (!is_ideographic(prev_token.type) || last_was_punctuation)) { - log_debug("Adding space III\n"); - 
string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - string_tree_finalize_token(tree); - } - } - - uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - - token_t token; - - size_t added_expansions = 0; - if ((value->components & options.address_components) > 0) { - key->n = namespace_len; - for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; - if (token.type != WHITESPACE) { - char_array_cat_len(key, str + token.offset, token.len); - last_added_was_whitespace = false; - } else { - char_array_cat(key, " "); - last_added_was_whitespace = true; - } - } - - char *key_str = char_array_get_string(key); - log_debug("key_str=%s\n", key_str); - address_expansion_array *expansions = value->expansions; - - if (expansions != NULL) { - for (size_t j = 0; j < expansions->n; j++) { - address_expansion_t expansion = expansions->a[j]; - - if ((expansion.address_components & options.address_components) == 0 && !address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { - continue; - } - - if (expansion.canonical_index != NULL_CANONICAL_INDEX) { - char *canonical = address_dictionary_get_canonical(expansion.canonical_index); - char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); - - canonical = canonical_normalized != NULL ? 
canonical_normalized : canonical; - - - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; - if (!is_numeric_token(next_token.type)) { - log_debug("non-canonical phrase, adding canonical string\n"); - string_tree_add_string(tree, canonical); - last_added_was_whitespace = false; - } else { - log_debug("adding canonical with cstring_array methods\n"); - uint32_t start_index = cstring_array_start_token(tree->strings); - cstring_array_append_string(tree->strings, canonical); - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - cstring_array_terminate(tree->strings); - } - } else { - string_tree_add_string(tree, canonical); - last_added_was_whitespace = false; - - } - - if (canonical_normalized != NULL) { - free(canonical_normalized); - } - } else { - log_debug("canonical phrase, adding canonical string\n"); - - uint32_t start_index = cstring_array_start_token(tree->strings); - for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) { - token = tokens->a[k]; - if (token.type != WHITESPACE) { - cstring_array_append_string_len(tree->strings, str + token.offset, token.len); - last_added_was_whitespace = false; - } else { - log_debug("space\n"); - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - } - } - cstring_array_terminate(tree->strings); - } - - added_expansions++; - } - - - } - } - - if (added_expansions == 0) { - uint32_t start_index = cstring_array_start_token(tree->strings); - for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; - - if (token.type != WHITESPACE) { - log_debug("Adding canonical token, %.*s\n", (int)token.len, str + token.offset); - cstring_array_append_string_len(tree->strings, str + token.offset, token.len); - last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding space\n"); - cstring_array_append_string(tree->strings, " "); - 
last_added_was_whitespace = true; - } - - } - - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len + 1]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - } - } - - cstring_array_terminate(tree->strings); - - } - - log_debug("i=%zu\n", i); - bool end_of_phrase = false; - if (i < phrases->n - 1) { - phrase_t next_phrase = phrases->a[i + 1].phrase; - end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); - } else { - end_of_phrase = true; - } - - log_debug("end_of_phrase=%d\n", end_of_phrase); - if (end_of_phrase) { - log_debug("finalize at i=%zu\n", i); - string_tree_finalize_token(tree); - } - - start = phrase.start + phrase.len; - prev_phrase = phrase; - - } - - char_array_destroy(key); - - end = (int)tokens->n; - - if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { - log_debug("space after phrase\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - string_tree_finalize_token(tree); - } - } - - - for (size_t j = start; j < end; j++) { - log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; - if (is_punctuation(token.type)) { - log_debug("last_was_punctuation\n"); - last_was_punctuation = true; - continue; - } - - if (token.type != WHITESPACE) { - if (j > 0 && last_was_punctuation && !last_added_was_whitespace) { - log_debug("Adding another space\n"); - string_tree_add_string(tree, " "); - string_tree_finalize_token(tree); - } - log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); - - bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); - 
last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding space IV\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - } else { - log_debug("Skipping token %zu\n", j); - continue; - } - - last_was_punctuation = false; - string_tree_finalize_token(tree); - - } - - - } else { - - for (size_t j = 0; j < tokens->n; j++) { - log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; - if (is_punctuation(token.type)) { - log_debug("punctuation, skipping\n"); - last_was_punctuation = true; - continue; - } - - if (token.type != WHITESPACE) { - if (last_was_punctuation && !last_added_was_whitespace) { - log_debug("Adding space V\n"); - string_tree_add_string(tree, " "); - string_tree_finalize_token(tree); - } - - bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); - last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { - log_debug("Adding space VI\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - } else { - continue; - } - - last_was_punctuation = false; - string_tree_finalize_token(tree); - } - } - - if (phrases != NULL) { - phrase_language_array_destroy(phrases); - } - - token_array_destroy(tokens); - - return tree; -} - - -static inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { - size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - - int32_t unichr = 0; - const uint8_t *ptr = (const uint8_t *)str; - - if (len_ordinal_suffix > 0) { - ssize_t start = 0; - size_t token_offset = token.offset; - size_t token_len = token.len; - - if (len_ordinal_suffix < token.len) { - start = token.offset + token.len - len_ordinal_suffix; - token_offset = token.offset; - token_len = token.len - len_ordinal_suffix; - } else { - start = prev_token.offset + prev_token.len; - token_offset = 
prev_token.offset; - token_len = prev_token.len; - } - ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); - if (prev_char_len <= 0) return false; - if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) { - return false; - } - } else { - return false; - } - - cstring_array *strings = tree->strings; - // Add the original form first. When this function returns true, - // add_normalized_strings_token won't be called a second time. - add_normalized_strings_token(strings, str, token, options); - - token_t normalized_token = token; - normalized_token.len = token.len - len_ordinal_suffix; - add_normalized_strings_token(strings, str, normalized_token, options); - return true; -} - -static inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { - cstring_array *strings = tree->strings; - - token_t prev_token = (token_t){0, 0, 0}; - - for (size_t i = 0; i < tokens->n; i++) { - token_t token = tokens->a[i]; - bool have_phrase = false; - bool have_ordinal = false; - - if (is_special_token(token.type)) { - string_tree_add_string_len(tree, str + token.offset, token.len); - string_tree_finalize_token(tree); - continue; - } - - for (size_t j = 0; j < options.num_languages; j++) { - char *lang = options.languages[j]; - if (expand_affixes(tree, str, lang, token, options)) { - have_phrase = true; - break; - } - - if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) { - have_ordinal = true; - break; - } - } - - if (!have_phrase && !have_ordinal) { - add_normalized_strings_token(strings, str, token, options); - } - - string_tree_finalize_token(tree); - prev_token = token; - } - -} - - -static void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options) { - size_t len = strlen(str); - token_array *tokens = tokenize_keep_whitespace(str); - 
string_tree_t *token_tree = string_tree_new_size(len); - - add_normalized_strings_tokenized(token_tree, str, tokens, options); - - string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree); - - string_tree_iterator_t *iter; - - char_array *temp_string = char_array_new_size(len); - - char *token; - - char *lang; - - kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); - - bool excessive_perms_outer = tokenized_iter->remaining >= EXCESSIVE_PERMUTATIONS; - - if (!excessive_perms_outer) { - kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining); - } - - log_debug("tokenized_iter->remaining=%d\n", tokenized_iter->remaining); - - for (; !string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) { - char_array_clear(temp_string); - - string_tree_iterator_foreach_token(tokenized_iter, token, { - if (token == NULL) { - continue; - } - char_array_append(temp_string, token); - }) - char_array_terminate(temp_string); - - char *tokenized_str = char_array_get_string(temp_string); - - string_tree_t *alternatives; - - int ret; - log_debug("Adding alternatives for single normalization\n"); - alternatives = add_string_alternatives(tokenized_str, options); - - log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives)); - - if (alternatives == NULL) { - log_debug("alternatives = NULL\n"); - continue; - } - - iter = string_tree_iterator_new(alternatives); - log_debug("iter->num_tokens=%d\n", iter->num_tokens); - log_debug("iter->remaining=%d\n", iter->remaining); - - bool excessive_perms_inner = iter->remaining >= EXCESSIVE_PERMUTATIONS; - - if (!excessive_perms_inner && !excessive_perms_outer) { - for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { - char_array_clear(temp_string); - string_tree_iterator_foreach_token(iter, token, { - log_debug("token=%s\n", token); - char_array_append(temp_string, token); - }) - 
char_array_terminate(temp_string); - - token = char_array_get_string(temp_string); - log_debug("full string=%s\n", token); - khiter_t k = kh_get(str_set, unique_strings, token); - - if (k == kh_end(unique_strings)) { - log_debug("doing postprocessing\n"); - add_postprocessed_string(strings, token, options); - k = kh_put(str_set, unique_strings, strdup(token), &ret); - } - - log_debug("iter->remaining = %d\n", iter->remaining); - - } - } else { - cstring_array_add_string(strings, tokenized_str); - } - - string_tree_iterator_destroy(iter); - string_tree_destroy(alternatives); - - if (excessive_perms_outer) { - break; - } - } - - string_tree_iterator_destroy(tokenized_iter); - string_tree_destroy(token_tree); - - token_array_destroy(tokens); - - char_array_destroy(temp_string); -} - char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { - options.address_components |= LIBPOSTAL_ADDRESS_ANY; - - uint64_t normalize_string_options = get_normalize_string_options(options); - - size_t len = strlen(input); - - language_classifier_response_t *lang_response = NULL; - - if (options.num_languages == 0) { - lang_response = classify_languages(input); - if (lang_response != NULL) { - options.num_languages = lang_response->num_languages; - options.languages = lang_response->languages; - } - } - - string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages); - - cstring_array *strings = cstring_array_new_size(len * 2); - char_array *temp_string = char_array_new_size(len); - - khash_t(str_set) *unique_strings = kh_init(str_set); - - char *token; - - log_debug("string_tree_num_tokens(tree) = %d\n", string_tree_num_tokens(tree)); - - if (string_tree_num_strings(tree) == 1) { - char *normalized = string_tree_get_alternative(tree, 0, 0); - expand_alternative(strings, unique_strings, normalized, options); - - } else { - log_debug("Adding alternatives for multiple normalizations\n"); - 
string_tree_iterator_t *iter = string_tree_iterator_new(tree); - - for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { - char *segment; - char_array_clear(temp_string); - bool is_first = true; - - string_tree_iterator_foreach_token(iter, segment, { - if (!is_first) { - char_array_append(temp_string, " "); - } - char_array_append(temp_string, segment); - is_first = false; - }) - char_array_terminate(temp_string); - token = char_array_get_string(temp_string); - log_debug("current permutation = %s\n", token); - expand_alternative(strings, unique_strings, token, options); - } - - string_tree_iterator_destroy(iter); - } - - char *key_str = NULL; - for (size_t i = kh_begin(unique_strings); i != kh_end(unique_strings); ++i) { - if (!kh_exist(unique_strings, i)) continue; - key_str = (char *)kh_key(unique_strings, i); - free(key_str); - } - - kh_destroy(str_set, unique_strings); - - if (lang_response != NULL) { - language_classifier_response_destroy(lang_response); - } - - char_array_destroy(temp_string); - string_tree_destroy(tree); - - *n = cstring_array_num_strings(strings); - - return cstring_array_to_strings(strings); - + return expand_address(input, options, n); } void libpostal_expansion_array_destroy(char **expansions, size_t n) { - for (size_t i = 0; i < n; i++) { - free(expansions[i]); - } - free(expansions); + expansion_array_destroy(expansions, n); } void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) { @@ -1184,7 +90,6 @@ libpostal_address_parser_response_t *libpostal_parse_address(char *address, libp if (parsed == NULL) { log_error("Parser returned NULL\n"); - libpostal_address_parser_response_destroy(parsed); return NULL; } From 272ee3b965255d63799443cecdfb6011300b0019 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 03:11:46 -0500 Subject: [PATCH 32/89] [dictionaries] adding a few more ambiguous expansions in English --- .../dictionaries/en/ambiguous_expansions.txt | 16 ++++++++++++++-- 1 
file changed, 14 insertions(+), 2 deletions(-) diff --git a/resources/dictionaries/en/ambiguous_expansions.txt b/resources/dictionaries/en/ambiguous_expansions.txt index a4de4500..b12bd311 100644 --- a/resources/dictionaries/en/ambiguous_expansions.txt +++ b/resources/dictionaries/en/ambiguous_expansions.txt @@ -14,11 +14,13 @@ co ct de dc +d e f fl g ga +h hi i id @@ -27,10 +29,12 @@ in ia j jbt +k ks ky l la +m ma me mb @@ -40,7 +44,6 @@ mn ms mo mt -m n nb nc @@ -59,13 +62,16 @@ nw nwt nv ny +o oh on ok or +p pa pe pei +q qc qld r @@ -77,11 +83,14 @@ sd se sk sw +t tas tn tx +u ut un +v vic vt va @@ -89,5 +98,8 @@ w wa wv wi +wy +x +y yt -wy \ No newline at end of file +z \ No newline at end of file From d0364ab6fbe81573749016aa60acb41c1e0e740b Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 03:14:00 -0500 Subject: [PATCH 33/89] [expand] adding method for checking phrase is in multiple dictionaries, and a helper method for determining whether an address phrase has a canonical interpretation --- src/address_dictionary.c | 41 ++++++++++++++++++++++++++++++++++++++++ src/address_dictionary.h | 3 +++ 2 files changed, 44 insertions(+) diff --git a/src/address_dictionary.c b/src/address_dictionary.c index 957306b8..9a1b328f 100644 --- a/src/address_dictionary.c +++ b/src/address_dictionary.c @@ -1,5 +1,6 @@ #include #include +#include #include "address_dictionary.h" @@ -54,6 +55,20 @@ bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id) { } +bool address_phrase_in_dictionaries(phrase_t phrase, size_t n, ...) 
{ + va_list args; + va_start(args, n); + bool in_dictionary = false; + for (size_t i = 0; i < n; i++) { + uint16_t dictionary_id = va_arg(args, uint16_t); + in_dictionary = address_phrase_in_dictionary(phrase, dictionary_id); + if (in_dictionary) break; + } + va_end(args); + return in_dictionary; +} + + int32_t address_dictionary_next_canonical_index(void) { if (address_dict == NULL || address_dict->canonical == NULL) { log_error(ADDRESS_DICTIONARY_SETUP_ERROR); @@ -81,6 +96,32 @@ char *address_dictionary_get_canonical(uint32_t index) { return cstring_array_get_string(address_dict->canonical, index); } +inline bool address_expansions_have_canonical_interpretation(address_expansion_array *expansions) { + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (expansion.canonical_index == NULL_CANONICAL_INDEX) { + return true; + } + } + return false; + +} + +inline bool address_phrase_has_canonical_interpretation(phrase_t phrase) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + + return address_expansions_have_canonical_interpretation(expansions); +} + + + address_expansion_value_t *address_expansion_value_new(void) { address_expansion_value_t *self = malloc(sizeof(address_expansion_value_t)); diff --git a/src/address_dictionary.h b/src/address_dictionary.h index 0ee7934f..bb000fb2 100644 --- a/src/address_dictionary.h +++ b/src/address_dictionary.h @@ -70,10 +70,13 @@ phrase_t search_address_dictionaries_suffix(char *str, size_t len, char *lang); address_expansion_value_t *address_dictionary_get_expansions(uint32_t i); bool address_expansion_in_dictionary(address_expansion_t expansion, uint16_t dictionary_id); bool address_phrase_in_dictionary(phrase_t phrase, uint16_t dictionary_id); 
+bool address_phrase_in_dictionaries(phrase_t phrase, size_t n, ...); char *address_dictionary_get_canonical(uint32_t index); int32_t address_dictionary_next_canonical_index(void); bool address_dictionary_add_canonical(char *canonical); bool address_dictionary_add_expansion(char *key, char *language, address_expansion_t expansion); +bool address_expansions_have_canonical_interpretation(address_expansion_array *expansions); +bool address_phrase_has_canonical_interpretation(phrase_t phrase); void address_dictionary_destroy(address_dictionary_t *self); From 3f7abd5b24f965ebeed7eec143f3ccacd58a525b Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 15:48:07 -0500 Subject: [PATCH 34/89] [expand] adding a method that allows hash/equality comparisons of addresses like "100 Main" with "100 S Main St." or units like "Apt 101" vs. "#101". Instead of expanding the phrase abbreviations, this version tries its best to delete all but the root words in a string for a specific component. It's probably not perfect, but does handle a number of edge cases related to pre/post directionals in English e.g. "E St" will have a root word of simply "E", "Avenue E" => "E", etc. Also handles a variety of cases where the phrase could be a thoroughfare type but is really a root word such as "Park Pl" or the famous "Avenue Rd". This can be used for near dupe hashing to catch possible dupes for later analysis. Note that it will normalize "St Marks Pl" and "St Marks Ave" to the same thing, which is sometimes warranted (if the user typed the wrong thoroughfare), but can also be reconciled at deduping time. 
--- src/expand.c | 584 ++++++++++++++++++++++++++++++++++++++++++++------- src/expand.h | 12 +- 2 files changed, 518 insertions(+), 78 deletions(-) diff --git a/src/expand.c b/src/expand.c index 709c35ac..fc9cf572 100644 --- a/src/expand.c +++ b/src/expand.c @@ -449,31 +449,235 @@ bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, } -string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options) { +inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_ACADEMIC_DEGREE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_BUILDING_TYPE: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_COMPANY_TYPE: + return LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_ELISION: + return LIBPOSTAL_ADDRESS_ANY; + case DICTIONARY_ENTRANCE: + return LIBPOSTAL_ADDRESS_ENTRANCE; + case DICTIONARY_HOUSE_NUMBER: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + case DICTIONARY_LEVEL_NUMBERED: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_STANDALONE: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_MEZZANINE: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_BASEMENT: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_SUB_BASEMENT: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_NUMBER: + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_NO_NUMBER: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + case DICTIONARY_PERSONAL_TITLE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PLACE_NAME: + return 
LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_POST_OFFICE: + return LIBPOSTAL_ADDRESS_PO_BOX; + case DICTIONARY_POSTAL_CODE: + return LIBPOSTAL_ADDRESS_POSTAL_CODE; + case DICTIONARY_QUALIFIER: + return LIBPOSTAL_ADDRESS_TOPONYM; + case DICTIONARY_STAIRCASE: + return LIBPOSTAL_ADDRESS_STAIRCASE; + case DICTIONARY_STOPWORD: + return LIBPOSTAL_ADDRESS_ANY; + case DICTIONARY_STREET_TYPE: + return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_UNIT_NUMBERED: + return LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_UNIT_STANDALONE: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT; + case DICTIONARY_UNIT_DIRECTION: + return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +inline uint32_t gazetter_edge_ignorable_components(uint16_t dictionary_id) { + switch (dictionary_id) { + // Pre/post directionals can be removed if there are non-phrase tokens + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +inline uint32_t gazetter_possible_root_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_ACADEMIC_DEGREE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PERSONAL_TITLE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_NUMBER: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_PLACE_NAME: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_QUALIFIER: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_SYNONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_TOPONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + +inline bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { + for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { + uint16_t 
dictionary_id = expansion.dictionary_ids[j]; + if (gazetter_ignorable_components(dictionary_id) & address_components) { + return true; + } + } + return false; +} + +inline bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { + for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { + uint16_t dictionary_id = expansion.dictionary_ids[j]; + if (gazetter_edge_ignorable_components(dictionary_id) & address_components) { + return true; + } + } + return false; +} + +inline bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) { + for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { + uint16_t dictionary_id = expansion.dictionary_ids[j]; + if (gazetter_possible_root_components(dictionary_id) & address_components) { + return true; + } + } + return false; +} + +bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions->a[i]; + + if (address_expansion_is_ignorable_for_components(expansion, address_components)) { + return true; + } + } + return false; +} + + +bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions->a[i]; + + if 
(address_expansion_is_edge_ignorable_for_components(expansion, address_components)) { + return true; + } + } + return false; +} + + +bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) { + uint32_t expansion_index = phrase.data; + address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); + + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions->a[i]; + + if (address_expansion_is_possible_root_for_components(expansion, address_components)) { + return true; + } + } + return false; +} + + + +bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { + address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); + if (value == NULL) return false; + + address_expansion_array *expansions = value->expansions; + if (expansions == NULL) return false; + + address_expansion_t *expansions_array = expansions->a; + + for (size_t i = 0; i < expansions->n; i++) { + address_expansion_t expansion = expansions_array[i]; + if (!address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { + return true; + } + } + return false; +} + + + + +// Delete non-canonical phrases only + +string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { char_array *key = NULL; log_debug("input=%s\n", str); - token_array *tokens = tokenize_keep_whitespace(str); + token_array *token_array = tokenize_keep_whitespace(str); - if (tokens == NULL) { + if (token_array == NULL) { return NULL; } size_t len = strlen(str); - log_debug("tokenized, num tokens=%zu\n", tokens->n); + token_t *tokens = token_array->a; + size_t num_tokens = token_array->n; + + log_debug("tokenized, num tokens=%zu\n", num_tokens); bool last_was_punctuation = 
false; phrase_language_array *phrases = NULL; phrase_array *lang_phrases = NULL; - for (size_t i = 0; i < options.num_languages; i++) { char *lang = options.languages[i]; log_debug("lang=%s\n", lang); - lang_phrases = search_address_dictionaries_tokens(str, tokens, lang); + lang_phrases = search_address_dictionaries_tokens(str, token_array, lang); if (lang_phrases == NULL) { log_debug("lang_phrases NULL\n"); @@ -494,7 +698,7 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } - lang_phrases = search_address_dictionaries_tokens(str, tokens, ALL_LANGUAGES); + lang_phrases = search_address_dictionaries_tokens(str, token_array, ALL_LANGUAGES); if (lang_phrases != NULL) { phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n); @@ -526,6 +730,79 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN); + log_debug("phrase_option = %d\n", phrase_option); + + bool delete_phrases = phrase_option == DELETE_PHRASES; + bool expand_phrases = phrase_option == EXPAND_PHRASES; + + size_t num_phrases = phrases->n; + + bool have_non_phrase_tokens = false; + bool have_canonical_phrases = false; + bool have_ambiguous = false; + bool have_strictly_ignorable = false; + bool have_strictly_ignorable_abbreviation = false; + + size_t prev_phrase_end = 0; + + if (delete_phrases) { + for (size_t i = 0; i < num_phrases; i++) { + phrase_lang = phrases->a[i]; + phrase = phrase_lang.phrase; + + log_debug("phrase.start = %zu, prev_phrase_end = %zu\n", phrase.start, prev_phrase_end); + + token_t inter_token; + if (phrase.start > prev_phrase_end) { + for (size_t j = prev_phrase_end; j < phrase.start; j++) { + inter_token = tokens[j]; + if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { + log_debug("have_non_phrase_tokens\n"); + have_non_phrase_tokens = true; + break; + } + } + } + + if (i == num_phrases - 1 && phrase.start 
+ phrase.len < num_tokens) { + for (size_t j = phrase.start + phrase.len; j < num_tokens; j++) { + inter_token = tokens[j]; + if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) { + have_non_phrase_tokens = true; + break; + } + } + } + + bool phrase_is_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(phrase, options.address_components) && !phrase_is_ambiguous; + bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase); + + have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous); + have_strictly_ignorable = have_strictly_ignorable || phrase_is_strictly_ignorable; + have_strictly_ignorable_abbreviation = have_strictly_ignorable_abbreviation || (phrase_is_strictly_ignorable && !phrase_is_canonical); + if (have_strictly_ignorable_abbreviation) { + log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical); + } + + have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous); + have_ambiguous = have_ambiguous || phrase_is_ambiguous; + + if (have_non_phrase_tokens) { + break; + } + + prev_phrase_end = phrase.start + phrase.len; + } + + + log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens); + log_debug("have_canonical_phrases = %d\n", have_canonical_phrases); + log_debug("have_ambiguous = %d\n", have_ambiguous); + log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable); + log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation); + } + for (size_t i = 0; i < phrases->n; i++) { phrase_lang = phrases->a[i]; @@ -550,54 +827,47 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t log_debug("start=%zu, end=%zu\n", start, end); for (size_t j = start; j < end; j++) { log_debug("Adding token 
%zu\n", j); - token_t token = tokens->a[j]; + token_t token = tokens[j]; if (is_punctuation(token.type)) { last_was_punctuation = true; continue; } if (token.type != WHITESPACE) { - if (phrase.start > 0 && last_was_punctuation && !last_added_was_whitespace) { + if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) ) { + log_debug("Adding space\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); } log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset); bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); + string_tree_finalize_token(tree); last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { + } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_strings(tree) > 0 ) { log_debug("Adding pre-phrase whitespace\n"); last_added_was_whitespace = true; string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); } else { continue; } last_was_punctuation = false; - string_tree_finalize_token(tree); } - if (phrase.start > 0 && start < end) { - token_t prev_token = tokens->a[phrase.start - 1]; - log_debug("last_added_was_whitespace=%d\n", last_added_was_whitespace); - if (!last_added_was_whitespace && phrase.start - 1 > 0 && (!is_ideographic(prev_token.type) || last_was_punctuation)) { - log_debug("Adding space III\n"); - string_tree_add_string(tree, " "); - last_added_was_whitespace = true; - string_tree_finalize_token(tree); - } - } + size_t added_expansions = 0; + token_t token; uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - token_t token; + bool expansion_valid_components = value->components & options.address_components; - size_t added_expansions = 0; - if ((value->components & options.address_components) > 0) { + if (expansion_valid_components) { key->n = namespace_len; for (size_t j = phrase.start; 
j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; + token = tokens[j]; if (token.type != WHITESPACE) { char_array_cat_len(key, str + token.offset, token.len); last_added_was_whitespace = false; @@ -612,22 +882,175 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t address_expansion_array *expansions = value->expansions; if (expansions != NULL) { - for (size_t j = 0; j < expansions->n; j++) { - address_expansion_t expansion = expansions->a[j]; + bool current_phrase_have_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool added_pre_phrase_space = false; + bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components); + bool current_phrase_have_edge_ignorable = false; - if ((expansion.address_components & options.address_components) == 0 && !address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) { + bool current_phrase_have_unambiguous = address_phrase_contains_unambiguous_expansion(phrase); + + /* + Edge phrase handling. This is primarily for handling pre-directionals/post-directionals + in English and other languages. 
+ */ + bool skip_edge_phrase = false; + bool other_phrase_have_edge_ignorable = false; + + if (delete_phrases) { + phrase_language_t other_phrase_lang; + phrase_t other_phrase; + + log_debug("i = %zu, phrase.start = %u\n", i, phrase.start); + if (i == 0 && phrase.start == 0 && phrase.start + phrase.len < num_tokens) { + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + // Delete "E" in "E 125th St" + if (current_phrase_have_edge_ignorable) { + log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); + skip_edge_phrase = true; + } + + if (!skip_edge_phrase || !have_non_phrase_tokens) { + for (size_t other_i = i + 1; other_i < phrases->n; other_i++) { + other_phrase_lang = phrases->a[other_i]; + other_phrase = other_phrase_lang.phrase; + log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len); + log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language); + if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) { + if (other_phrase.start + other_phrase.len == num_tokens) { + skip_edge_phrase = false; + if (current_phrase_have_edge_ignorable) { + // don't delete the "E" in "E St" + log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n"); + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + } else { + log_debug("initial phrase is not edge-ignorable out of two phrases. 
Checking next phrase is edge ignorable.\n"); + // delete "Avenue" in "Avenue E" + other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); + skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + + } + } else { + // If we encounter an ignorable phrase + skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase); + log_debug("phrase is possible root = %d\n", skip_edge_phrase); + } + break; + } + } + } + } else if (phrases->n > 1 && i == phrases->n - 1 && phrase.start + phrase.len == num_tokens && phrase.start > 0) { + current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components); + if (current_phrase_have_edge_ignorable) { + log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len); + skip_edge_phrase = true; + } + + log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens); + if (!skip_edge_phrase || !have_non_phrase_tokens) { + for (ssize_t other_j = i - 1; other_j >= 0; other_j--) { + other_phrase_lang = phrases->a[other_j]; + other_phrase = other_phrase_lang.phrase; + log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len); + log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language); + if (other_phrase.start + other_phrase.len <= phrase.start && string_equals(other_phrase_lang.language, phrase_lang.language)) { + if (other_phrase.start == 0) { + //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && 
!address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components); + skip_edge_phrase = false; + if (current_phrase_have_edge_ignorable) { + // don't delete the "E" in "Avenue E" + log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n"); + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + //skip_edge_phrase = !other_phrase_invalid; + } else { + log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n"); + // delete "St" in "E St" + other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); + skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); + } + } + break; + } + } + } + } + } + + for (size_t j = 0; j < expansions->n; j++) { + if (skip_edge_phrase) { + log_debug("skip edge phrase\n"); continue; } - if (expansion.canonical_index != NULL_CANONICAL_INDEX) { + address_expansion_t expansion = expansions->a[j]; + + bool current_phrase_ignorable = false; + bool current_phrase_expandable = expand_phrases && expansion.canonical_index != NULL_CANONICAL_INDEX; + + bool is_ambiguous = address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION); + + if (delete_phrases) { + bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, 
options.address_components); + bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX; + + log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable); + + current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; + + // Edge phrase calculations from above + if (current_phrase_have_edge_ignorable || other_phrase_have_edge_ignorable) { + log_debug("current_phrase_have_edge_ignorable\n"); + log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); + current_phrase_ignorable = skip_edge_phrase; + // Delete "Avenue" in "5th Avenue" + } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) { + log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n"); + current_phrase_ignorable = have_non_phrase_tokens || string_tree_num_tokens(tree) > 0; + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S" + } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) { + log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n"); + current_phrase_ignorable = have_non_phrase_tokens || have_canonical_phrases || have_ambiguous; + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases)) { + log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases); + current_phrase_ignorable = is_ignorable || (is_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); + + 
log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n"); + log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + } + + if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) { + log_debug("Adding space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + last_added_was_whitespace = true; + added_pre_phrase_space = true; + } + + } + + if (current_phrase_ignorable) { + continue; + } + + if (delete_phrases) { + current_phrase_expandable = !current_phrase_ignorable; + } + + log_debug("expand_phrases = %d\n", expand_phrases); + + log_debug("expansion.canonical_index = %d\n", expansion.canonical_index); + + if (expansion.canonical_index != NULL_CANONICAL_INDEX && current_phrase_expandable) { + log_debug("expansion.canonical_index != NULL_CANONICAL_INDEX, delete_phrases = %d, phrase_option = %d\n", delete_phrases, phrase_option); char *canonical = address_dictionary_get_canonical(expansion.canonical_index); char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options); canonical = canonical_normalized != NULL ? 
canonical_normalized : canonical; - - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; + if (phrase.start + phrase.len < num_tokens - 1) { + token_t next_token = tokens[phrase.start + phrase.len]; if (!is_numeric_token(next_token.type)) { log_debug("non-canonical phrase, adding canonical string\n"); string_tree_add_string(tree, canonical); @@ -643,18 +1066,17 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } else { string_tree_add_string(tree, canonical); last_added_was_whitespace = false; - } if (canonical_normalized != NULL) { free(canonical_normalized); } - } else { + } else if (expansion.canonical_index == NULL_CANONICAL_INDEX || !current_phrase_expandable) { log_debug("canonical phrase, adding canonical string\n"); uint32_t start_index = cstring_array_start_token(tree->strings); for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) { - token = tokens->a[k]; + token = tokens[k]; if (token.type != WHITESPACE) { cstring_array_append_string_len(tree->strings, str + token.offset, token.len); last_added_was_whitespace = false; @@ -665,19 +1087,30 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } } cstring_array_terminate(tree->strings); + } else { + continue; } added_expansions++; } - } } - if (added_expansions == 0) { + log_debug("expansion_valid_components == %d\n", expansion_valid_components); + + if (added_expansions == 0 && (!delete_phrases || !expansion_valid_components)) { + if (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) { + log_debug("Adding space\n"); + string_tree_add_string(tree, " "); + string_tree_finalize_token(tree); + last_added_was_whitespace = true; + } + uint32_t start_index = cstring_array_start_token(tree->strings); + for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) { - token = tokens->a[j]; + token = tokens[j]; if (token.type != WHITESPACE) { log_debug("Adding 
canonical token, %.*s\n", (int)token.len, str + token.offset); @@ -691,31 +1124,25 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } - if (phrase.start + phrase.len < tokens->n - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len + 1]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { - cstring_array_append_string(tree->strings, " "); - last_added_was_whitespace = true; - } - } - cstring_array_terminate(tree->strings); } - log_debug("i=%zu\n", i); - bool end_of_phrase = false; - if (i < phrases->n - 1) { - phrase_t next_phrase = phrases->a[i + 1].phrase; - end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); - } else { - end_of_phrase = true; - } + if (!delete_phrases || !expansion_valid_components || added_expansions > 0) { + log_debug("i=%zu\n", i); + bool end_of_phrase = false; + if (i < phrases->n - 1) { + phrase_t next_phrase = phrases->a[i + 1].phrase; + end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len); + } else { + end_of_phrase = true; + } - log_debug("end_of_phrase=%d\n", end_of_phrase); - if (end_of_phrase) { - log_debug("finalize at i=%zu\n", i); - string_tree_finalize_token(tree); + log_debug("end_of_phrase=%d\n", end_of_phrase); + if (end_of_phrase) { + log_debug("finalize at i=%zu\n", i); + string_tree_finalize_token(tree); + } } start = phrase.start + phrase.len; @@ -725,11 +1152,11 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t char_array_destroy(key); - end = (int)tokens->n; + end = (int)num_tokens; - if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1) { - token_t next_token = tokens->a[phrase.start + phrase.len]; - if (next_token.type != WHITESPACE && !last_added_was_whitespace && !is_ideographic(next_token.type)) { + if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1 && !last_added_was_whitespace) 
{ + token_t next_token = tokens[phrase.start + phrase.len]; + if (next_token.type != WHITESPACE && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !is_ideographic(next_token.type)) { log_debug("space after phrase\n"); string_tree_add_string(tree, " "); last_added_was_whitespace = true; @@ -740,7 +1167,7 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t for (size_t j = start; j < end; j++) { log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; + token_t token = tokens[j]; if (is_punctuation(token.type)) { log_debug("last_was_punctuation\n"); last_was_punctuation = true; @@ -757,7 +1184,7 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { + } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding space IV\n"); string_tree_add_string(tree, " "); last_added_was_whitespace = true; @@ -773,10 +1200,10 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t } else { - - for (size_t j = 0; j < tokens->n; j++) { + log_debug("phrases NULL\n"); + for (size_t j = 0; j < num_tokens; j++) { log_debug("On token %zu\n", j); - token_t token = tokens->a[j]; + token_t token = tokens[j]; if (is_punctuation(token.type)) { log_debug("punctuation, skipping\n"); last_was_punctuation = true; @@ -809,12 +1236,11 @@ string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t phrase_language_array_destroy(phrases); } - token_array_destroy(tokens); + token_array_destroy(token_array); return tree; } - inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); @@ -895,7 
+1321,7 @@ inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, tok } -void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options) { +void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { size_t len = strlen(str); token_array *tokens = tokenize_keep_whitespace(str); string_tree_t *token_tree = string_tree_new_size(len); @@ -939,7 +1365,7 @@ void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings int ret; log_debug("Adding alternatives for single normalization\n"); - alternatives = add_string_alternatives(tokenized_str, options); + alternatives = add_string_alternatives_phrase_option(tokenized_str, options, phrase_option); log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives)); @@ -998,7 +1424,7 @@ void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings -char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { +char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { options.address_components |= LIBPOSTAL_ADDRESS_ANY; uint64_t normalize_string_options = get_normalize_string_options(options); @@ -1028,7 +1454,7 @@ char **expand_address(char *input, libpostal_normalize_options_t options, size_t if (string_tree_num_strings(tree) == 1) { char *normalized = string_tree_get_alternative(tree, 0, 0); - expand_alternative(strings, unique_strings, normalized, options); + expand_alternative_phrase_option(strings, unique_strings, normalized, options, phrase_option); } else { log_debug("Adding alternatives for multiple normalizations\n"); @@ -1049,7 +1475,7 @@ char **expand_address(char *input, libpostal_normalize_options_t options, size_t 
char_array_terminate(temp_string); token = char_array_get_string(temp_string); log_debug("current permutation = %s\n", token); - expand_alternative(strings, unique_strings, token, options); + expand_alternative_phrase_option(strings, unique_strings, token, options, phrase_option); } string_tree_iterator_destroy(iter); @@ -1077,6 +1503,16 @@ char **expand_address(char *input, libpostal_normalize_options_t options, size_t } +char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(input, options, n, EXPAND_PHRASES); +} + +char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_phrase_option(input, options, n, DELETE_PHRASES); +} + + + void expansion_array_destroy(char **expansions, size_t n) { for (size_t i = 0; i < n; i++) { free(expansions[i]); diff --git a/src/expand.h b/src/expand.h index 0e24cae4..0f961f81 100644 --- a/src/expand.h +++ b/src/expand.h @@ -38,15 +38,19 @@ bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, l bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options); bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options); -string_tree_t *add_string_alternatives(char *str, libpostal_normalize_options_t options); - bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options); void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options); -void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options); +typedef enum { + EXPAND_PHRASES, + KEEP_PHRASES, + DELETE_PHRASES +} expansion_phrase_option_t; + char **expand_address(char *input, 
libpostal_normalize_options_t options, size_t *n); +char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); void expansion_array_destroy(char **expansions, size_t n); -#endif \ No newline at end of file +#endif From 9eef46adeece81547e945fa057e1babe82018b99 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 17:22:37 -0500 Subject: [PATCH 35/89] [expand] in cases like "Avenue D" where there are two phrases, one is ambiguous (and canonical) but not necessarily edge-ignorable (pre/post-directional), allow deletion of the other token (so "Avenue" in this case). Also allows skipping in cases where the language classifier may predict a second language with some small probability, such as French for a short string like "Avenue D" (in addition to English). If the token was ignorable in the highest probability language, ignore it in both. 
--- src/expand.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/expand.c b/src/expand.c index fc9cf572..d4e5dc2d 100644 --- a/src/expand.c +++ b/src/expand.c @@ -803,6 +803,8 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation); } + bool skipped_last_edge_phrase = false; + for (size_t i = 0; i < phrases->n; i++) { phrase_lang = phrases->a[i]; @@ -882,19 +884,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal address_expansion_array *expansions = value->expansions; if (expansions != NULL) { - bool current_phrase_have_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); + bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION); bool added_pre_phrase_space = false; bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components); bool current_phrase_have_edge_ignorable = false; - bool current_phrase_have_unambiguous = address_phrase_contains_unambiguous_expansion(phrase); + bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase); + + bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase); /* Edge phrase handling. This is primarily for handling pre-directionals/post-directionals in English and other languages. 
*/ bool skip_edge_phrase = false; - bool other_phrase_have_edge_ignorable = false; + bool other_phrase_is_ignorable = false; if (delete_phrases) { phrase_language_t other_phrase_lang; @@ -918,19 +922,21 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) { if (other_phrase.start + other_phrase.len == num_tokens) { skip_edge_phrase = false; - if (current_phrase_have_edge_ignorable) { + if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) { // don't delete the "E" in "E St" log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n"); + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); } else { log_debug("initial phrase is not edge-ignorable out of two phrases. 
Checking next phrase is edge ignorable.\n"); // delete "Avenue" in "Avenue E" - other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); - skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase)); + skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); } } else { - // If we encounter an ignorable phrase + // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. 
"E St SE", this is probably a legit token instead of a pre-directional skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase); log_debug("phrase is possible root = %d\n", skip_edge_phrase); } @@ -956,16 +962,15 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (other_phrase.start == 0) { //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components); skip_edge_phrase = false; - if (current_phrase_have_edge_ignorable) { + if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) { // don't delete the "E" in "Avenue E" log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n"); skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); - //skip_edge_phrase = !other_phrase_invalid; } else { log_debug("final phrase is not edge-ignorable out of two phrases. 
Checking previous phrase is edge ignorable.\n"); // delete "St" in "E St" - other_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); - skip_edge_phrase = other_phrase_have_edge_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase)); + skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); } } @@ -976,10 +981,17 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } } + if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) { + skip_edge_phrase = true; + } + for (size_t j = 0; j < expansions->n; j++) { if (skip_edge_phrase) { + skipped_last_edge_phrase = true; log_debug("skip edge phrase\n"); continue; + } else { + skipped_last_edge_phrase = false; } address_expansion_t expansion = expansions->a[j]; @@ -998,7 +1010,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; // Edge phrase calculations from above - if (current_phrase_have_edge_ignorable || other_phrase_have_edge_ignorable) { + if (current_phrase_have_edge_ignorable || 
other_phrase_is_ignorable) { log_debug("current_phrase_have_edge_ignorable\n"); log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); current_phrase_ignorable = skip_edge_phrase; From 8b2a4d1ecf78a998e9dfc29f9c369d8cfeff721c Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 17:46:26 -0500 Subject: [PATCH 36/89] [api] adding libpostal_expand_address_root to the public API. This will attempt to delete tokens that can be safely ignored. It's deterministic and rule-based, but is informed by libpostal's fairly comprehensive dictionaries, and should work relatively well across languages for deduping purposes. --- src/libpostal.c | 4 ++++ src/libpostal.h | 1 + 2 files changed, 5 insertions(+) diff --git a/src/libpostal.c b/src/libpostal.c index 32d80331..f12d4898 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -48,6 +48,10 @@ char **libpostal_expand_address(char *input, libpostal_normalize_options_t optio return expand_address(input, options, n); } +char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { + return expand_address_root(input, options, n); +} + void libpostal_expansion_array_destroy(char **expansions, size_t n) { expansion_array_destroy(expansions, n); } diff --git a/src/libpostal.h b/src/libpostal.h index e88d5625..f088db72 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -138,6 +138,7 @@ typedef struct libpostal_normalize_options { LIBPOSTAL_EXPORT libpostal_normalize_options_t libpostal_get_default_options(void); LIBPOSTAL_EXPORT char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n); +LIBPOSTAL_EXPORT char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); LIBPOSTAL_EXPORT void libpostal_expansion_array_destroy(char **expansions, size_t n); From a1db4d773466470617b2daa0ed56c0ac4ae3648c Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 19:53:11 -0500 Subject: [PATCH 37/89] [expand/normalize] the 
split_alpha_from_numeric option now applies to both e.g. A1 and 1A since we now strip out ordinal suffixes prior to normalization --- src/expand.c | 2 +- src/normalize.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/expand.c b/src/expand.c index d4e5dc2d..622567d9 100644 --- a/src/expand.c +++ b/src/expand.c @@ -104,7 +104,7 @@ void add_normalized_strings_token(cstring_array *strings, char *str, token_t tok } } - if (is_numeric_token(token.type) && options.split_alpha_from_numeric && numeric_starts_with_alpha(str, token)) { + if (is_numeric_token(token.type) && options.split_alpha_from_numeric) { normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; normalize_token(strings, str, token, normalize_token_options); normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; diff --git a/src/normalize.c b/src/normalize.c index ff21af9b..3e218e9d 100644 --- a/src/normalize.c +++ b/src/normalize.c @@ -423,15 +423,18 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch); int next_cat = utf8proc_category(next_ch); bool next_is_number = utf8_is_number(next_cat); + bool next_is_letter = utf8_is_letter(next_cat); bool is_full_stop = ch == FULL_STOP_CODEPOINT; + bool is_hyphen_between_letter_and_number = is_hyphen && ((next_is_number && last_was_letter) || (next_is_letter && last_was_number)); + if (is_hyphen && options & NORMALIZE_TOKEN_REPLACE_HYPHENS && (!(last_was_number && next_is_number) || options & NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS)) { char_array_append(array, " "); append_char = false; } else if (is_hyphen && options & NORMALIZE_TOKEN_DELETE_HYPHENS) { - append_char = false; + append_char = !is_hyphen_between_letter_and_number; } if ((is_hyphen || is_full_stop) && token.type == NUMERIC && options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && last_was_letter) { @@ -452,7 +455,7 @@ void add_normalized_token(char_array 
*array, char *str, token_t token, uint64_t append_char = false; } - if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && last_was_letter && is_number && !alpha_numeric_split) { + if (options & NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC && token.type == NUMERIC && ((last_was_letter && is_number) || (last_was_number && is_letter)) && !alpha_numeric_split) { char_array_append(array, " "); alpha_numeric_split = true; } From 26a6d9684d83a9689d63d4e493e7d454d87c9af7 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 20:00:48 -0500 Subject: [PATCH 38/89] [test] adding tests for root-only expansions. Mostly English tests for the moment to deal with the various edge cases, but is also important for Spanish where "Calle" is so common that it's often omitted, same with French and "rue", etc. --- test/test_expand.c | 97 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 9 deletions(-) diff --git a/test/test_expand.c b/test/test_expand.c index d97838ae..59ed9af7 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -8,14 +8,21 @@ SUITE(libpostal_expansion_tests); -static greatest_test_res test_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { +static greatest_test_res test_expansion_contains_phrase_option(char *input, char *output, libpostal_normalize_options_t options, bool root) { size_t num_expansions; - char **expansions = libpostal_expand_address(input, options, &num_expansions); + + char **expansions = NULL; + if (!root) { + expansions = libpostal_expand_address(input, options, &num_expansions); + } else { + expansions = libpostal_expand_address_root(input, options, &num_expansions); + } bool contains_expansion = false; char *expansion; for (size_t i = 0; i < num_expansions; i++) { expansion = expansions[i]; + printf("expansion = %s\n", expansion); if (string_equals(output, expansion)) { contains_expansion = true; break; @@ -38,15 +45,26 @@ static greatest_test_res 
test_expansion_contains(char *input, char *output, libp PASS(); } -static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) { +static greatest_test_res test_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { + bool root = false; + CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); + + PASS(); +} + +static greatest_test_res test_root_expansion_contains(char *input, char *output, libpostal_normalize_options_t options) { + bool root = true; + CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); + + PASS(); +} + +static greatest_test_res test_expansion_contains_phrase_option_with_languages(char *input, char *output, libpostal_normalize_options_t options, bool root, size_t num_languages, va_list args) { char **languages = NULL; size_t i; if (num_languages > 0) { - va_list args; - - va_start(args, num_languages); languages = malloc(sizeof(char *) * num_languages); char *lang; @@ -56,8 +74,6 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha languages[i] = strdup(lang); } - va_end(args); - options.num_languages = num_languages; options.languages = (char **)languages; } else { @@ -65,7 +81,7 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha options.num_languages = 0; } - CHECK_CALL(test_expansion_contains(input, output, options)); + CHECK_CALL(test_expansion_contains_phrase_option(input, output, options, root)); if (languages != NULL) { for (i = 0; i < num_languages; i++) { free(languages[i]); @@ -76,6 +92,36 @@ static greatest_test_res test_expansion_contains_with_languages(char *input, cha } + +static greatest_test_res test_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) 
{ + bool root = false; + if (num_languages > 0) { + va_list args; + va_start(args, num_languages); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + va_end(args); + } else { + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL)); + } + PASS(); +} + + +static greatest_test_res test_root_expansion_contains_with_languages(char *input, char *output, libpostal_normalize_options_t options, size_t num_languages, ...) { + bool root = true; + if (num_languages > 0) { + va_list args; + va_start(args, num_languages); + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, args)); + va_end(args); + } else { + CHECK_CALL(test_expansion_contains_phrase_option_with_languages(input, output, options, root, num_languages, NULL)); + } + PASS(); +} + + + TEST test_expansions(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -91,6 +137,38 @@ TEST test_expansions(void) { PASS(); } +TEST test_street_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; + + // English - normal cases + CHECK_CALL(test_root_expansion_contains("Malcolm X Blvd", "malcolm x", options)); + CHECK_CALL(test_root_expansion_contains("E 106th St", "106", options)); + CHECK_CALL(test_root_expansion_contains("S Park Ave", "park", options)); + CHECK_CALL(test_root_expansion_contains("Park South", "park", options)); + CHECK_CALL(test_root_expansion_contains("Rev Dr. MLK Dr S", "martin luther king junior", options)); + CHECK_CALL(test_root_expansion_contains("Rev Dr. 
Martin Luther King Jr Dr S", "martin luther king junior", options)); + CHECK_CALL(test_root_expansion_contains("East 6th Street", "6th", options)); + + // English - edge cases + CHECK_CALL(test_root_expansion_contains("Avenue B", "b", options)); + CHECK_CALL(test_root_expansion_contains("Avenue C", "c", options)); + CHECK_CALL(test_root_expansion_contains("Avenue D", "d", options)); + CHECK_CALL(test_root_expansion_contains("Avenue E", "e", options)); + CHECK_CALL(test_root_expansion_contains("Avenue N", "n", options)); + CHECK_CALL(test_root_expansion_contains("U St SE", "u", options)); + CHECK_CALL(test_root_expansion_contains("S Park", "park", options)); + CHECK_CALL(test_root_expansion_contains("Park S", "park", options)); + CHECK_CALL(test_root_expansion_contains("Avenue Rd", "avenue", options)); + CHECK_CALL(test_root_expansion_contains("Broadway", "broadway", options)); + CHECK_CALL(test_root_expansion_contains("E Broadway", "east", options)); + + // Spanish + CHECK_CALL(test_root_expansion_contains("C/ Ocho", "8", options)); + PASS(); +} + + TEST test_expansions_language_classifier(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -132,6 +210,7 @@ SUITE(libpostal_expansion_tests) { } RUN_TEST(test_expansions); + RUN_TEST(test_street_root_expansions); RUN_TEST(test_expansions_language_classifier); RUN_TEST(test_expansions_no_options); From bfdb6b8f87cc1cae9ba47870ff23deae0bb8ba51 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 20:17:01 -0500 Subject: [PATCH 39/89] [test] adding header to fix warning --- test/test_expand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_expand.c b/test/test_expand.c index 59ed9af7..2b211728 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -4,6 +4,7 @@ #include #include "greatest.h" +#include "../src/string_utils.h" #include "../src/libpostal.h" SUITE(libpostal_expansion_tests); @@ -168,7 +169,6 @@ TEST test_street_root_expansions(void) { PASS(); 
} - TEST test_expansions_language_classifier(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); From 1d22da603f5970ba796b041f4f76dcfd5e98f31c Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 20:17:28 -0500 Subject: [PATCH 40/89] [test] house number expansion tests --- test/test_expand.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/test_expand.c b/test/test_expand.c index 2b211728..dffd6785 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -169,6 +169,23 @@ TEST test_street_root_expansions(void) { PASS(); } + +TEST test_house_number_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; + + // English - normal cases + CHECK_CALL(test_root_expansion_contains("1A", "1 a", options)); + CHECK_CALL(test_root_expansion_contains("A1", "a 1", options)); + CHECK_CALL(test_root_expansion_contains("1", "1", options)); + CHECK_CALL(test_root_expansion_contains_with_languages("# 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("No. 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("House No. 
1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("House #1", "1", options, 1, "en")); + + PASS(); +} + TEST test_expansions_language_classifier(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -211,6 +228,7 @@ SUITE(libpostal_expansion_tests) { RUN_TEST(test_expansions); RUN_TEST(test_street_root_expansions); + RUN_TEST(test_house_number_root_expansions); RUN_TEST(test_expansions_language_classifier); RUN_TEST(test_expansions_no_options); From 727469b7367de0a7092c48a4ff7e16824b11a25c Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 21:57:21 -0500 Subject: [PATCH 41/89] [expand] no longer delete phrases in cases like "PH 1" for units, where there's a phrase that can accompany numbered units and thus be ignored similar to "Apt 1" but that phrase may also be a qualifier (i.e. Apt 1 and Penthouse 1 are not the same) --- src/expand.c | 168 +++++++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 71 deletions(-) diff --git a/src/expand.c b/src/expand.c index 622567d9..8bc6269d 100644 --- a/src/expand.c +++ b/src/expand.c @@ -449,7 +449,7 @@ bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, } -inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) { +inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_ACADEMIC_DEGREE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; @@ -468,17 +468,17 @@ inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) { case DICTIONARY_LEVEL_NUMBERED: return LIBPOSTAL_ADDRESS_LEVEL; case DICTIONARY_LEVEL_STANDALONE: - return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_LEVEL_MEZZANINE: - return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL| 
LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_LEVEL_BASEMENT: - return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_LEVEL_SUB_BASEMENT: - return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_LEVEL; + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_NUMBER: return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_NO_NUMBER: - return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_PERSONAL_TITLE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_PLACE_NAME: @@ -498,15 +498,15 @@ inline uint32_t gazetter_ignorable_components(uint16_t dictionary_id) { case DICTIONARY_UNIT_NUMBERED: return LIBPOSTAL_ADDRESS_UNIT; case DICTIONARY_UNIT_STANDALONE: - return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT; + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_UNIT_DIRECTION: - return LIBPOSTAL_ADDRESS_ANY ^ LIBPOSTAL_ADDRESS_UNIT; + return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY); default: return LIBPOSTAL_ADDRESS_NONE; } } -inline uint32_t gazetter_edge_ignorable_components(uint16_t dictionary_id) { +inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) { switch (dictionary_id) { // Pre/post directionals can be removed if there are non-phrase tokens case DICTIONARY_DIRECTIONAL: @@ -516,7 +516,25 @@ inline uint32_t gazetter_edge_ignorable_components(uint16_t dictionary_id) { } } -inline uint32_t gazetter_possible_root_components(uint16_t dictionary_id) { +inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case 
DICTIONARY_LEVEL_STANDALONE: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_MEZZANINE: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_BASEMENT: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_LEVEL_SUB_BASEMENT: + return LIBPOSTAL_ADDRESS_LEVEL; + case DICTIONARY_UNIT_STANDALONE: + return LIBPOSTAL_ADDRESS_UNIT; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + + +inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_ACADEMIC_DEGREE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; @@ -537,37 +555,58 @@ inline uint32_t gazetter_possible_root_components(uint16_t dictionary_id) { } } -inline bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { +typedef enum { + GAZETTEER_MATCH_IGNORABLE, + GAZETTEER_MATCH_EDGE_IGNORABLE, + GAZETTEER_MATCH_POSSIBLE_ROOT, + GAZETTEER_MATCH_SPECIFIER +} gazetteer_match_type_t; + + +inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) { for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { uint16_t dictionary_id = expansion.dictionary_ids[j]; - if (gazetter_ignorable_components(dictionary_id) & address_components) { + uint32_t components = 0; + switch (match_type) { + case GAZETTEER_MATCH_IGNORABLE: + components = gazetteer_ignorable_components(dictionary_id); + break; + case GAZETTEER_MATCH_EDGE_IGNORABLE: + components = gazetteer_edge_ignorable_components(dictionary_id); + break; + case GAZETTEER_MATCH_POSSIBLE_ROOT: + components = gazetteer_possible_root_components(dictionary_id); + break; + case GAZETTEER_MATCH_SPECIFIER: + components = gazetteer_specifier_components(dictionary_id); + break; + default: + break; + } + if (components & address_components) { return true; } } return false; } +inline bool address_expansion_is_ignorable_for_components(address_expansion_t 
expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_IGNORABLE); +} + inline bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { - for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { - uint16_t dictionary_id = expansion.dictionary_ids[j]; - if (gazetter_edge_ignorable_components(dictionary_id) & address_components) { - return true; - } - } - return false; + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE); } inline bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) { - for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { - uint16_t dictionary_id = expansion.dictionary_ids[j]; - if (gazetter_possible_root_components(dictionary_id) & address_components) { - return true; - } - } - return false; + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT); } -bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) { +inline bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER); +} + +bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) { uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); @@ -579,54 +618,29 @@ bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t addres for (size_t i = 0; i < expansions->n; i++) { address_expansion_t expansion = expansions->a[i]; - if (address_expansion_is_ignorable_for_components(expansion, 
address_components)) { + if (address_expansion_matches_type_for_components(expansion, address_components, match_type)) { return true; } } return false; } +inline bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_IGNORABLE); +} -bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) { - uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - - if (value == NULL) return false; - - address_expansion_array *expansions = value->expansions; - if (expansions == NULL) return false; - - for (size_t i = 0; i < expansions->n; i++) { - address_expansion_t expansion = expansions->a[i]; - - if (address_expansion_is_edge_ignorable_for_components(expansion, address_components)) { - return true; - } - } - return false; +inline bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE); } -bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) { - uint32_t expansion_index = phrase.data; - address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - - if (value == NULL) return false; - - address_expansion_array *expansions = value->expansions; - if (expansions == NULL) return false; - - for (size_t i = 0; i < expansions->n; i++) { - address_expansion_t expansion = expansions->a[i]; - - if (address_expansion_is_possible_root_for_components(expansion, address_components)) { - return true; - } - } - return false; +inline bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, 
GAZETTEER_MATCH_POSSIBLE_ROOT); } - +inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER); +} bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); @@ -646,9 +660,6 @@ bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { return false; } - - - // Delete non-canonical phrases only string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) { @@ -889,8 +900,11 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components); bool current_phrase_have_edge_ignorable = false; + bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components); bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase); + log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier); + bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase); /* @@ -1009,11 +1023,20 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; + if (!is_canonical) { + char *canon = address_dictionary_get_canonical(expansion.canonical_index); + log_debug("canonical = %s\n", canon); + } + // Edge phrase calculations from above if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) { log_debug("current_phrase_have_edge_ignorable\n"); log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); 
current_phrase_ignorable = skip_edge_phrase; + // Don't delete "PH" in "PH 1" for unit expansions + } else if (is_ignorable && have_non_phrase_tokens && current_phrase_have_specifier) { + log_debug("current_phrase_have_specifier\n"); + current_phrase_ignorable = false; // Delete "Avenue" in "5th Avenue" } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) { log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n"); @@ -1026,10 +1049,12 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases)) { log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases); - current_phrase_ignorable = is_ignorable || (is_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); + current_phrase_ignorable = is_ignorable || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n"); log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); + } else { + log_debug("none of the above\n"); } if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) { @@ -1064,11 +1089,11 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (phrase.start + phrase.len < num_tokens - 1) { token_t next_token = tokens[phrase.start + phrase.len]; if (!is_numeric_token(next_token.type)) { - log_debug("non-canonical phrase, adding canonical string\n"); + log_debug("non-canonical phrase, adding canonical string: %s\n", canonical); string_tree_add_string(tree, canonical); last_added_was_whitespace = false; } else { - 
log_debug("adding canonical with cstring_array methods\n"); + log_debug("adding canonical with cstring_array methods: %s\n", canonical); uint32_t start_index = cstring_array_start_token(tree->strings); cstring_array_append_string(tree->strings, canonical); cstring_array_append_string(tree->strings, " "); @@ -1076,6 +1101,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal cstring_array_terminate(tree->strings); } } else { + log_debug("adding canonical: %s\n", canonical); string_tree_add_string(tree, canonical); last_added_was_whitespace = false; } @@ -1223,7 +1249,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } if (token.type != WHITESPACE) { - if (last_was_punctuation && !last_added_was_whitespace) { + if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_strings(tree) > 0) { log_debug("Adding space V\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); @@ -1231,7 +1257,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); last_added_was_whitespace = false; - } else if (!last_added_was_whitespace) { + } else if (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) { log_debug("Adding space VI\n"); string_tree_add_string(tree, " "); last_added_was_whitespace = true; From f7326e52f6ea80249e8cefa56e55cdea5dd61b4e Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 22:00:03 -0500 Subject: [PATCH 42/89] [test] level expansion tests --- test/test_expand.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/test/test_expand.c b/test/test_expand.c index dffd6785..4a989d68 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -186,6 +186,36 @@ TEST test_house_number_root_expansions(void) { PASS(); } +TEST test_level_root_expansions(void) { + libpostal_normalize_options_t options = 
libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; + + // English - normal cases + CHECK_CALL(test_root_expansion_contains_with_languages("1st Fl", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1st Floor", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("First Fl", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("First Floor", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("2nd Fl", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("2nd Floor", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Second Fl", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Second Floor", "2", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Fl #1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Fl No. 1", "1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Floor No. 
1", "1", options, 1, "en")); + + // Specifiers + CHECK_CALL(test_root_expansion_contains_with_languages("SB 1", "sub basement 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Bsmt", "basement", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Bsmt 1", "basement 1", options, 1, "en")); + + CHECK_CALL(test_root_expansion_contains_with_languages("1G", "1 ground", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("G", "ground", options, 1, "en")); + + PASS(); +} + + + TEST test_expansions_language_classifier(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -229,6 +259,7 @@ SUITE(libpostal_expansion_tests) { RUN_TEST(test_expansions); RUN_TEST(test_street_root_expansions); RUN_TEST(test_house_number_root_expansions); + RUN_TEST(test_level_root_expansions); RUN_TEST(test_expansions_language_classifier); RUN_TEST(test_expansions_no_options); From 27f4eb27214a950619167c7dc259ca83d2174ca9 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 22:01:30 -0500 Subject: [PATCH 43/89] [test] unit expansion tests --- test/test_expand.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/test_expand.c b/test/test_expand.c index 4a989d68..34195f44 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -214,7 +214,33 @@ TEST test_level_root_expansions(void) { PASS(); } +TEST test_unit_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; + // English - normal cases + CHECK_CALL(test_root_expansion_contains_with_languages("1A", "1 a", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("A1", "a 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apt 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apt No 
101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apt #101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apartment 101", "101", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Apartment #101", "101", options, 1, "en")); + + // Specifiers + CHECK_CALL(test_root_expansion_contains_with_languages("PH 1", "penthouse 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("PH1", "penthouse 1", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("Penthouse 1", "penthouse 1", options, 1, "en")); + + CHECK_CALL(test_root_expansion_contains_with_languages("1L", "1l", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1L", "1 left", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1F", "1f", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1F", "1f", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1R", "1r", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("1R", "1r", options, 1, "en")); + + PASS(); +} TEST test_expansions_language_classifier(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -260,6 +286,7 @@ SUITE(libpostal_expansion_tests) { RUN_TEST(test_street_root_expansions); RUN_TEST(test_house_number_root_expansions); RUN_TEST(test_level_root_expansions); + RUN_TEST(test_unit_root_expansions); RUN_TEST(test_expansions_language_classifier); RUN_TEST(test_expansions_no_options); From f63a9cc579b184e31aeb0825bcfc26740444a5fc Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 22:12:12 -0500 Subject: [PATCH 44/89] [expand] adding number phrases as ignorable in PO boxes --- src/expand.c | 2 +- src/gazetteer_data.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/expand.c b/src/expand.c index 8bc6269d..5f8b9674 
100644 --- a/src/expand.c +++ b/src/expand.c @@ -476,7 +476,7 @@ inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { case DICTIONARY_LEVEL_SUB_BASEMENT: return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_NUMBER: - return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET; + return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_NO_NUMBER: return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY); case DICTIONARY_PERSONAL_TITLE: diff --git a/src/gazetteer_data.c b/src/gazetteer_data.c index 0c23759a..444c225e 100644 --- a/src/gazetteer_data.c +++ b/src/gazetteer_data.c @@ -25,7 +25,7 @@ gazetteer_t gazetteer_config[] = { {DICTIONARY_NAMED_ORGANIZATION, LIBPOSTAL_ADDRESS_NAME}, {DICTIONARY_NAMED_PERSON, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_NO_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER}, - {DICTIONARY_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE}, + {DICTIONARY_NUMBER, LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE}, {DICTIONARY_PERSONAL_SUFFIX, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_PERSONAL_TITLE, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_PLACE_NAME, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET}, From ff3c7ab3b6fc1ec349f20fc6dfaf0ee8785c2437 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 17 Dec 2017 22:12:37 -0500 Subject: [PATCH 45/89] [test] PO box expansion tests --- test/test_expand.c | 12 
++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/test_expand.c b/test/test_expand.c index 34195f44..6436fb92 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -242,6 +242,17 @@ TEST test_unit_root_expansions(void) { PASS(); } + +TEST test_po_box_root_expansions(void) { + libpostal_normalize_options_t options = libpostal_get_default_options(); + options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; + + CHECK_CALL(test_root_expansion_contains_with_languages("PO Box 1234", "1234", options, 1, "en")); + CHECK_CALL(test_root_expansion_contains_with_languages("PO Box #1234", "1234", options, 1, "en")); + + PASS(); +} + TEST test_expansions_language_classifier(void) { libpostal_normalize_options_t options = libpostal_get_default_options(); @@ -287,6 +298,7 @@ SUITE(libpostal_expansion_tests) { RUN_TEST(test_house_number_root_expansions); RUN_TEST(test_level_root_expansions); RUN_TEST(test_unit_root_expansions); + RUN_TEST(test_po_box_root_expansions); RUN_TEST(test_expansions_language_classifier); RUN_TEST(test_expansions_no_options); From d03ce4e058a73f42da8c57136d6e61b3d3349783 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 18 Dec 2017 18:17:16 -0500 Subject: [PATCH 46/89] [expand] remove blank expansions and strip spaces --- src/expand.c | 16 +++++++++++++++- src/string_utils.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/expand.c b/src/expand.c index 5f8b9674..80b4250e 100644 --- a/src/expand.c +++ b/src/expand.c @@ -1428,13 +1428,27 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * char_array_terminate(temp_string); token = char_array_get_string(temp_string); + + size_t token_len = strlen(token); + + if (token_len == 0) continue; + + size_t left_spaces = string_left_spaces_len(token, token_len); + size_t right_spaces = string_right_spaces_len(token, token_len); + + if (left_spaces + right_spaces == token_len) { + continue; + } + log_debug("full string=%s\n", 
token); khiter_t k = kh_get(str_set, unique_strings, token); if (k == kh_end(unique_strings)) { + char *dupe_token = strndup(str + left_spaces, len - left_spaces - right_spaces); + log_debug("doing postprocessing\n"); add_postprocessed_string(strings, token, options); - k = kh_put(str_set, unique_strings, strdup(token), &ret); + k = kh_put(str_set, unique_strings, dupe_token, &ret); } log_debug("iter->remaining = %d\n", iter->remaining); diff --git a/src/string_utils.h b/src/string_utils.h index a94f1d93..873a670a 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -119,6 +119,8 @@ ssize_t string_next_period(char *str); bool string_contains_period_len(char *str, size_t len); bool string_contains_period(char *str); +size_t string_left_spaces_len(char *str, size_t len); +size_t string_right_spaces_len(char *str, size_t len); char *string_trim(char *str); size_t string_hyphen_prefix_len(char *str, size_t len); From e432243256e74d581db2eaa689ae5f7bca204a5d Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 19 Dec 2017 15:11:47 -0500 Subject: [PATCH 47/89] [dictionaries] adding "7 11" as a name for 7-eleven, even though it's completely numeric. 
Only affects the house/name component in deduping, so should be fine --- resources/dictionaries/all/chains.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/dictionaries/all/chains.txt b/resources/dictionaries/all/chains.txt index 5974e524..8eeca1b4 100644 --- a/resources/dictionaries/all/chains.txt +++ b/resources/dictionaries/all/chains.txt @@ -1,4 +1,4 @@ -7-eleven|7 eleven|7-11|seven-eleven|seven eleven|seveneleven|seven-11|seven 11|7-elevens|7 elevens|7-11s|seven-elevens|seven elevens|sevenelevens|seven-11s|seven 11s|sevel +7-eleven|7 eleven|7-11|seven-eleven|seven eleven|seveneleven|seven-11|seven 11|7-elevens|7 elevens|7-11s|seven-elevens|seven elevens|sevenelevens|seven-11s|seven 11s|sevel|7 11 a&w|a & w|a and w|a&ws|a & ws|a and ws|a&w restaurants|a & w restaurants|a and w restaurants ace hardware|ace hardwares adidas From dfc9064b0f8291c894c6b04ed4f723c7162444e1 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 23 Dec 2017 18:13:46 -0500 Subject: [PATCH 48/89] [dictionaries] adding Stores to place names dictionary --- resources/dictionaries/en/place_names.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/dictionaries/en/place_names.txt b/resources/dictionaries/en/place_names.txt index 004adcc6..33290e82 100644 --- a/resources/dictionaries/en/place_names.txt +++ b/resources/dictionaries/en/place_names.txt @@ -267,6 +267,7 @@ stadium station|sta|stn steakhouse store|stor +stores studio studios subdivision From 7d42c94b199df114f527a832d15b0902738ae34a Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 23 Dec 2017 18:14:14 -0500 Subject: [PATCH 49/89] [dictionaries] adding "for" to English stopword dictionaries --- resources/dictionaries/en/stopwords.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/dictionaries/en/stopwords.txt b/resources/dictionaries/en/stopwords.txt index b033a3d4..3bb221cf 100644 --- a/resources/dictionaries/en/stopwords.txt +++ b/resources/dictionaries/en/stopwords.txt @@ -3,6 +3,7 @@ all 
at between|betw|btwn|btw|btween|b / t by +for in of on From 1fd5433bc5e605ec4bec5fcc0e2af36bb679e6e9 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 23 Dec 2017 19:36:03 -0500 Subject: [PATCH 50/89] [dictionaries] adding associates/association to company types --- resources/dictionaries/en/company_types.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/resources/dictionaries/en/company_types.txt b/resources/dictionaries/en/company_types.txt index 3af64e20..5d2f36fa 100644 --- a/resources/dictionaries/en/company_types.txt +++ b/resources/dictionaries/en/company_types.txt @@ -1,3 +1,5 @@ +associates|assoc +association|assoc bank b corporation|b corp|bcorp charitable incorporated organization|cio|c i o @@ -34,7 +36,7 @@ limited liability limited partnership|lllp|l l l p limited liability partnership|llp|l l p limited partnership|lp|l p look through company|look through co|lookthrough company|lookthrough co|ltc -national association|na|n a +national association|na|n a|nat assoc|natl assoc national trust and savings association|national trust & savings association|nt & sa|nt and sa|nt sa|ntsa no liability|nl|n l nonprofit|non profit From 03c89bcf3cf8570ec612fb95ebed542d5dfe4fde Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 23 Dec 2017 19:49:05 -0500 Subject: [PATCH 51/89] [dictionaries] adding "a" to English stopwords, "service" and "services" to English place names --- resources/dictionaries/en/place_names.txt | 2 ++ resources/dictionaries/en/stopwords.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/resources/dictionaries/en/place_names.txt b/resources/dictionaries/en/place_names.txt index 33290e82..58baf28b 100644 --- a/resources/dictionaries/en/place_names.txt +++ b/resources/dictionaries/en/place_names.txt @@ -252,6 +252,8 @@ salon sanctuary|sanct sauna secondary school +service|svc +services|svcs|svc shelter sheriff's department|sherrifs department|sheriff's dept|sherrifs dept sherrif's office|sherffis office|sheriff's ofc|sheriffs ofc 
diff --git a/resources/dictionaries/en/stopwords.txt b/resources/dictionaries/en/stopwords.txt index 3bb221cf..c88da481 100644 --- a/resources/dictionaries/en/stopwords.txt +++ b/resources/dictionaries/en/stopwords.txt @@ -1,3 +1,4 @@ +a and|& all at From 3e554b8033755ff2e9a34579373bacc9282d097d Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 00:53:24 -0500 Subject: [PATCH 52/89] [dictionaries] adding ambiguous expansions in English --- .../dictionaries/en/ambiguous_expansions.txt | 79 ++++++++++++++++++- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/resources/dictionaries/en/ambiguous_expansions.txt b/resources/dictionaries/en/ambiguous_expansions.txt index b12bd311..f678ccb1 100644 --- a/resources/dictionaries/en/ambiguous_expansions.txt +++ b/resources/dictionaries/en/ambiguous_expansions.txt @@ -2,31 +2,68 @@ aat act ab -al +abby ak +al +alee +ally +aly ar az +ant +app +apt arc +art +arty +ave +avens +aves +ba bc +bot +byu c ca +carp +cause +ce co +col +con +coop +cor +cowy ct de dc +div +divers d +doc +dup e +elb +ex f +fit fl +form +fry g ga +gen +gra h hi +hon i id il +imp in ia +is j jbt k @@ -34,13 +71,22 @@ ks ky l la +lit +low +lynn m ma me mb md +mem mi +miss +mid +mil +mun mn +mr ms mo mt @@ -67,38 +113,65 @@ oh on ok or +out p pa +pass pe pei +plat +pur q qc qld +quad r +ra +ran +rep +reps +rev ri ro +row +rowy s +sa sc sd se +sec +sect +sen +sh +shun sk sw t tas +thick +thro tn +tri tx +tun u +up ut un v -vic -vt va +via +vic +vill +vis +vt w wa wv wi wy +wyn x y yt From 4e3d868bd02709038c85a45e01a2e002b7be5aa5 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 01:45:50 -0500 Subject: [PATCH 53/89] [parser] adding label constants to address_parser header --- src/address_parser.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/address_parser.h b/src/address_parser.h index 4c5e699f..b059a246 100644 --- a/src/address_parser.h +++ b/src/address_parser.h @@ -105,7 +105,14 @@ typedef enum { #define 
ADDRESS_PARSER_LABEL_HOUSE "house" #define ADDRESS_PARSER_LABEL_HOUSE_NUMBER "house_number" +#define ADDRESS_PARSER_LABEL_PO_BOX "po_box" +#define ADDRESS_PARSER_LABEL_BUILDING "building" +#define ADDRESS_PARSER_LABEL_ENTRANCE "entrance" +#define ADDRESS_PARSER_LABEL_STAIRCASE "staircase" +#define ADDRESS_PARSER_LABEL_LEVEL "level" +#define ADDRESS_PARSER_LABEL_UNIT "unit" #define ADDRESS_PARSER_LABEL_ROAD "road" +#define ADDRESS_PARSER_LABEL_METRO_STATION "metro_station" #define ADDRESS_PARSER_LABEL_SUBURB "suburb" #define ADDRESS_PARSER_LABEL_CITY_DISTRICT "city_district" #define ADDRESS_PARSER_LABEL_CITY "city" @@ -117,6 +124,8 @@ typedef enum { #define ADDRESS_PARSER_LABEL_COUNTRY "country" #define ADDRESS_PARSER_LABEL_WORLD_REGION "world_region" +#define ADDRESS_PARSER_LABEL_WEBSITE "website" +#define ADDRESS_PARSER_LABEL_TELEPHONE "phone" typedef union address_parser_types { uint32_t value; From c78566c2410d7499fb39442a0b93e5d620c335f1 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 01:46:20 -0500 Subject: [PATCH 54/89] [utils] adding cstring_array_extend and string_tree_clear --- src/string_utils.c | 18 ++++++++++++++++++ src/string_utils.h | 4 ++++ 2 files changed, 22 insertions(+) diff --git a/src/string_utils.c b/src/string_utils.c index 7045dd25..45cc1373 100644 --- a/src/string_utils.c +++ b/src/string_utils.c @@ -1110,6 +1110,18 @@ cstring_array *cstring_array_from_strings(char **strings, size_t n) { return array; } +bool cstring_array_extend(cstring_array *array, cstring_array *other) { + if (array == NULL || other == NULL) return false; + size_t n = cstring_array_num_strings(other); + + for (size_t i = 0; i < n; i++) { + char *s_i = cstring_array_get_string(other, i); + cstring_array_add_string(array, s_i); + } + return true; +} + + inline size_t cstring_array_capacity(cstring_array *self) { return self->str->m; } @@ -1318,6 +1330,12 @@ inline void string_tree_finalize_token(string_tree_t *self) { uint32_array_push(self->token_indices, 
(uint32_t)cstring_array_num_strings(self->strings)); } +void string_tree_clear(string_tree_t *self) { + uint32_array_clear(self->token_indices); + uint32_array_push(self->token_indices, 0); + cstring_array_clear(self->strings); +} + // terminated inline void string_tree_add_string(string_tree_t *self, char *str) { cstring_array_add_string(self->strings, str); diff --git a/src/string_utils.h b/src/string_utils.h index 873a670a..1ddcc626 100644 --- a/src/string_utils.h +++ b/src/string_utils.h @@ -208,6 +208,8 @@ void cstring_array_clear(cstring_array *self); cstring_array *cstring_array_from_char_array(char_array *str); cstring_array *cstring_array_from_strings(char **strings, size_t n); +bool cstring_array_extend(cstring_array *array, cstring_array *other); + // Convert cstring_array to an array of n C strings and destroy the cstring_array char **cstring_array_to_strings(cstring_array *self); @@ -285,6 +287,8 @@ void string_tree_add_string_len(string_tree_t *self, char *str, size_t len); void string_tree_append_string(string_tree_t *self, char *str); void string_tree_append_string_len(string_tree_t *self, char *str, size_t len); +void string_tree_clear(string_tree_t *self); + uint32_t string_tree_num_tokens(string_tree_t *self); uint32_t string_tree_num_strings(string_tree_t *self); From 6c6e5062e0e600b10bbd776f7348263637806ca2 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 02:19:41 -0500 Subject: [PATCH 55/89] [gazetteers] removing stopwords, etc. 
from numeric type components, adding street type expansions to name components --- src/gazetteer_data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gazetteer_data.c b/src/gazetteer_data.c index 444c225e..82d75362 100644 --- a/src/gazetteer_data.c +++ b/src/gazetteer_data.c @@ -10,7 +10,7 @@ gazetteer_t gazetteer_config[] = { {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, LIBPOSTAL_ADDRESS_ANY}, {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, LIBPOSTAL_ADDRESS_ANY}, {DICTIONARY_CROSS_STREET, LIBPOSTAL_ADDRESS_STREET}, - {DICTIONARY_DIRECTIONAL, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_DIRECTIONAL, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE}, {DICTIONARY_ELISION, LIBPOSTAL_ADDRESS_ANY}, {DICTIONARY_ENTRANCE, LIBPOSTAL_ADDRESS_ENTRANCE}, {DICTIONARY_GIVEN_NAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, @@ -33,10 +33,10 @@ gazetteer_t gazetteer_config[] = { {DICTIONARY_POSTAL_CODE, LIBPOSTAL_ADDRESS_POSTAL_CODE}, {DICTIONARY_QUALIFIER, LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_STAIRCASE, LIBPOSTAL_ADDRESS_STAIRCASE}, - {DICTIONARY_STOPWORD, LIBPOSTAL_ADDRESS_ANY}, - {DICTIONARY_STREET_TYPE, LIBPOSTAL_ADDRESS_STREET}, + {DICTIONARY_STOPWORD, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM}, + {DICTIONARY_STREET_TYPE, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, {DICTIONARY_SURNAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, - {DICTIONARY_SYNONYM, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_SYNONYM, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM}, {DICTIONARY_TOPONYM, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_TOPONYM}, 
{DICTIONARY_UNIT_NUMBERED, LIBPOSTAL_ADDRESS_UNIT}, {DICTIONARY_UNIT_STANDALONE, LIBPOSTAL_ADDRESS_UNIT}, From acfdb50d7ce2b900a094edd79eec60cfaecc71a6 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 02:43:46 -0500 Subject: [PATCH 56/89] [dedupe] adding near-dupe hashing function, which can be thought of as the blocking function in record linkage or as a form of locally sensitive hashing in general document deduping. The goal is, if two addresses/names are the same, they should share at least one hash. These hashes can also be used as an inverted index (DB, ES, hashtable, etc.). Uses the double metaphone for name words in Latin script (otherwise each individual token, and sequences of two tokens in the case of ideograms for e.g. Chinese, Japanese, Korean, etc.) --- src/near_dupe.c | 942 ++++++++++++++++++++++++++++++++++++++++++++++++ src/near_dupe.h | 14 + 2 files changed, 956 insertions(+) create mode 100644 src/near_dupe.c create mode 100644 src/near_dupe.h diff --git a/src/near_dupe.c b/src/near_dupe.c new file mode 100644 index 00000000..547d0aba --- /dev/null +++ b/src/near_dupe.c @@ -0,0 +1,942 @@ +#include + +#include "log/log.h" + +#include "near_dupe.h" +#include "double_metaphone.h" +#include "expand.h" +#include "features.h" +#include "float_utils.h" +#include "place.h" +#include "scanner.h" +#include "string_utils.h" +#include "tokens.h" +#include "unicode_scripts.h" +#include "unicode_script_types.h" + +#include "geohash/geohash.h" + +#define MAX_GEOHASH_PRECISION 12 + +#define NAME_KEY_PREFIX "n" +#define ADDRESS_KEY_PREFIX "a" +#define UNIT_KEY_PREFIX "u" +#define PO_BOX_KEY_PREFIX "p" +#define HOUSE_NUMBER_KEY_PREFIX "h" +#define STREET_KEY_PREFIX "s" + +#define GEOHASH_KEY_PREFIX "gh" +#define POSTCODE_KEY_PREFIX "pc" +#define CITY_KEY_PREFIX "ct" +#define CONTAINING_BOUNDARY_PREFIX "cb" + +#define NAME_ADDRESS_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define 
NAME_ADDRESS_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_ADDRESS_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_ADDRESS_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_ADDRESS_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_ADDRESS_CITY_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_ADDRESS_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_ADDRESS_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_HOUSE_NUMBER_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_HOUSE_NUMBER_CITY_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_HOUSE_NUMBER_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_HOUSE_NUMBER_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_STREET_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_STREET_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_STREET_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX 
CONTAINING_BOUNDARY_PREFIX +#define NAME_STREET_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_STREET_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_STREET_CITY_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_STREET_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_STREET_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_PO_BOX_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_PO_BOX_CITY_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_PO_BOX_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_PO_BOX_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define NAME_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX GEOHASH_KEY_PREFIX +#define NAME_CITY_KEY_PREFIX NAME_KEY_PREFIX CITY_KEY_PREFIX +#define NAME_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define NAME_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define ADDRESS_UNIT_GEOHASH_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define ADDRESS_UNIT_CITY_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define ADDRESS_UNIT_CONTAINING_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define ADDRESS_UNIT_POSTCODE_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define ADDRESS_GEOHASH_KEY_PREFIX ADDRESS_KEY_PREFIX 
GEOHASH_KEY_PREFIX +#define ADDRESS_CITY_KEY_PREFIX ADDRESS_KEY_PREFIX CITY_KEY_PREFIX +#define ADDRESS_CONTAINING_KEY_PREFIX ADDRESS_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define ADDRESS_POSTCODE_KEY_PREFIX ADDRESS_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define HOUSE_NUMBER_GEOHASH_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX GEOHASH_KEY_PREFIX +#define HOUSE_NUMBER_CITY_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CITY_KEY_PREFIX +#define HOUSE_NUMBER_CONTAINING_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define HOUSE_NUMBER_POSTCODE_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define STREET_GEOHASH_KEY_PREFIX STREET_KEY_PREFIX GEOHASH_KEY_PREFIX +#define STREET_CITY_KEY_PREFIX STREET_KEY_PREFIX CITY_KEY_PREFIX +#define STREET_CONTAINING_KEY_PREFIX STREET_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define STREET_POSTCODE_KEY_PREFIX STREET_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define STREET_UNIT_GEOHASH_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX +#define STREET_UNIT_CITY_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX +#define STREET_UNIT_CONTAINING_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define STREET_UNIT_POSTCODE_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX + +#define PO_BOX_GEOHASH_KEY_PREFIX PO_BOX_KEY_PREFIX GEOHASH_KEY_PREFIX +#define PO_BOX_CITY_KEY_PREFIX PO_BOX_KEY_PREFIX CITY_KEY_PREFIX +#define PO_BOX_CONTAINING_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX +#define PO_BOX_POSTCODE_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX + 
+cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, size_t *n) { + size_t num_expansions = 0; + cstring_array *expansions = expand_address(input, options, &num_expansions); + + size_t num_root_expansions = 0; + cstring_array *root_expansions = expand_address_root(input, options, &num_root_expansions); + + if (num_root_expansions == 0) { + cstring_array_destroy(root_expansions); + *n = num_expansions; + return expansions; + } else if (num_expansions == 0) { + cstring_array_destroy(expansions); + *n = num_root_expansions; + return root_expansions; + } else { + khash_t(str_set) *unique_strings = kh_init(str_set); + char *expansion; + khiter_t k; + int ret; + + cstring_array *all_expansions = cstring_array_new(); + + for (size_t i = 0; i < num_expansions; i++) { + expansion = cstring_array_get_string(expansions, i); + k = kh_get(str_set, unique_strings, expansion); + + if (k == kh_end(unique_strings)) { + cstring_array_add_string(all_expansions, expansion); + k = kh_put(str_set, unique_strings, expansion, &ret); + if (ret < 0) { + break; + } + } + } + + for (size_t i = 0; i < num_root_expansions; i++) { + expansion = cstring_array_get_string(root_expansions, i); + k = kh_get(str_set, unique_strings, expansion); + + if (k == kh_end(unique_strings)) { + cstring_array_add_string(all_expansions, expansion); + k = kh_put(str_set, unique_strings, expansion, &ret); + if (ret < 0) { + break; + } + } + } + + *n = cstring_array_num_strings(all_expansions); + + kh_destroy(str_set, unique_strings); + cstring_array_destroy(root_expansions); + cstring_array_destroy(expansions); + + return all_expansions; + } +} + +inline cstring_array *expanded_component_root_with_fallback(char *input, libpostal_normalize_options_t options, size_t *n) { + cstring_array *root_expansions = expand_address_root(input, options, n); + if (*n > 0) { + return root_expansions; + } else { + cstring_array_destroy(root_expansions); + *n = 0; + return 
expand_address(input, options, n); + } +} + + +static cstring_array *geohash_and_neighbors(double latitude, double longitude, size_t geohash_precision) { + if (geohash_precision == 0) return NULL; + + if (geohash_precision > MAX_GEOHASH_PRECISION) geohash_precision = MAX_GEOHASH_PRECISION; + + char geohash[geohash_precision + 1]; + if (geohash_encode(latitude, longitude, geohash, geohash_precision) != GEOHASH_OK) { + return NULL; + } + + size_t neighbors_size = geohash_precision * 8 + 1; + char neighbors[neighbors_size]; + + int num_strings = 0; + + if (geohash_neighbors(geohash, neighbors, neighbors_size, &num_strings) == GEOHASH_OK && num_strings == 8) { + cstring_array *strings = cstring_array_new_size(9 * geohash_precision + 1); + cstring_array_add_string(strings, geohash); + + for (int i = 0; i < num_strings; i++) { + char *neighbor = neighbors + geohash_precision * i; + cstring_array_add_string(strings, neighbor); + } + return strings; + } + + return NULL; +} + + +cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; + size_t num_expansions = 0; + cstring_array *name_expansions = expanded_component_root_with_fallback(name, normalize_options, &num_expansions); + if (num_expansions == 0) { + cstring_array_destroy(name_expansions); + return NULL; + } + + size_t len = strlen(name); + + char_array *token_string_array = char_array_new_size(len); + cstring_array *strings = cstring_array_new_size(len); + token_array *token_array = token_array_new(); + + char_array *combined_words_no_whitespace = char_array_new(); + + bool keep_whitespace = false; + + khash_t(str_set) *unique_strings = kh_init(str_set); + khiter_t k; + int ret = 0; + + for (size_t i = 0; i < num_expansions; i++) { + char *expansion = cstring_array_get_string(name_expansions, i); + log_debug("expansion = %s\n", expansion); + tokenize_add_tokens(token_array, expansion, 
strlen(expansion), keep_whitespace); + size_t num_tokens = token_array->n; + token_t *tokens = token_array->a; + token_t prev_token; + char *token_str; + for (size_t j = 0; j < num_tokens; j++) { + token_t token = tokens[j]; + bool ideogram = is_ideographic(token.type); + + string_script_t token_script = get_string_script(expansion + token.offset, token.len); + bool is_latin = token_script.len == token.len && token_script.script == SCRIPT_LATIN; + + char_array_clear(token_string_array); + // For ideograms, since the "words" are characters, we use shingles of two characters + if (ideogram && j > 0 && is_ideographic(prev_token.type)) { + log_debug("cat ideogram\n"); + char_array_cat_len(token_string_array, expansion + prev_token.offset, prev_token.len); + } + + // For Latin script, add double metaphone of the words + if (is_latin && !is_numeric_token(token.type) && !ideogram && !is_punctuation(token.type)) { + char_array_clear(token_string_array); + char_array_cat_len(token_string_array, expansion + token.offset, token.len); + token_str = char_array_get_string(token_string_array); + + log_debug("token_str = %s\n", token_str); + + double_metaphone_codes_t *dm_codes = double_metaphone(token_str); + if (dm_codes == NULL) { + prev_token = token; + continue; + } + char *dm_primary = dm_codes->primary; + char *dm_secondary = dm_codes->secondary; + + if (!string_equals(dm_primary, "")) { + + k = kh_get(str_set, unique_strings, dm_primary); + + if (k == kh_end(unique_strings)) { + log_debug("adding dm_primary = %s\n", dm_primary); + cstring_array_add_string(strings, dm_primary); + k = kh_put(str_set, unique_strings, strdup(dm_primary), &ret); + if (ret < 0) { + break; + } + } + + if (!string_equals(dm_secondary, dm_primary)) { + + k = kh_get(str_set, unique_strings, dm_secondary); + + if (k == kh_end(unique_strings)) { + log_debug("adding dm_secondary = %s\n", dm_secondary); + cstring_array_add_string(strings, dm_secondary); + k = kh_put(str_set, unique_strings, 
strdup(dm_secondary), &ret); + if (ret < 0) { + break; + } + } + } + } + double_metaphone_codes_destroy(dm_codes); + // For non-Latin words (Arabic, Cyrllic, etc.) just add the word + // For ideograms, we do two-character shingles, so only add the first character if the string has one token + } else if (!ideogram || j > 0 || num_tokens == 1) { + char_array_cat_len(token_string_array, expansion + token.offset, token.len); + token_str = char_array_get_string(token_string_array); + log_debug("token_str = %s\n", token_str); + k = kh_get(str_set, unique_strings, token_str); + + if (k == kh_end(unique_strings)) { + cstring_array_add_string(strings, token_str); + k = kh_put(str_set, unique_strings, strdup(token_str), &ret); + if (ret < 0) { + break; + } + } + } + + prev_token = token; + } + + token_array_clear(token_array); + } + + char_array_destroy(token_string_array); + token_array_destroy(token_array); + char_array_destroy(combined_words_no_whitespace); + + const char *key; + + kh_foreach_key(unique_strings, key, { + free((char *)key); + }); + kh_destroy(str_set, unique_strings); + + return strings; +} + + +static inline void add_string_arrays_to_tree(string_tree_t *tree, size_t n, va_list args) { + for (size_t i = 0; i < n; i++) { + cstring_array *a = va_arg(args, cstring_array *); + size_t num_strings = cstring_array_num_strings(a); + if (num_strings == 0) continue; + for (size_t j = 0; j < num_strings; j++) { + char *str = cstring_array_get_string(a, j); + string_tree_add_string(tree, str); + } + string_tree_finalize_token(tree); + } + va_end(args); +} + +static inline void add_hashes_from_tree(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree) { + string_tree_iterator_t *iter = string_tree_iterator_new(tree); + if (iter->num_tokens > 0) { + log_debug("iter->num_tokens = %zu\n", iter->num_tokens); + + for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { + + cstring_array_start_token(near_dupe_hashes); + 
cstring_array_append_string(near_dupe_hashes, prefix); + + char *str; + string_tree_iterator_foreach_token(iter, str, { + cstring_array_append_string(near_dupe_hashes, "|"); + cstring_array_append_string(near_dupe_hashes, str); + //log_debug("str=%s\n", str); + }); + + cstring_array_terminate(near_dupe_hashes); + } + } + + string_tree_iterator_destroy(iter); +} + + +static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree, size_t n, ...) { + string_tree_clear(tree); + + log_debug("prefix=%s\n", prefix); + + va_list args; + va_start(args, n); + add_string_arrays_to_tree(tree, n, args); + va_end(args); + + log_debug("string_tree_num_strings(tree)=%zu\n", string_tree_num_strings(tree)); + + add_hashes_from_tree(near_dupe_hashes, prefix, tree); +} + + +cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) { + place_t *place = place_from_components(num_components, labels, values); + log_debug("created place\n"); + if (place == NULL) return NULL; + + size_t n = 0; + + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + + language_classifier_response_t *lang_response = NULL; + + if (num_languages == 0) { + lang_response = place_languages(num_components, labels, values); + + if (lang_response != NULL) { + log_debug("got %zu place languages\n", lang_response->num_languages); + normalize_options.num_languages = lang_response->num_languages; + normalize_options.languages = lang_response->languages; + } + } else { + normalize_options.num_languages = languages; + normalize_options.languages = languages; + } + + string_tree_t *tree = string_tree_new(); + + cstring_array *name_expansions = NULL; + size_t num_name_expansions = 0; + if (place->name != NULL && options.with_name) { + log_debug("Doing name expansions for %s\n", place->name); + name_expansions = 
name_word_hashes(place->name, normalize_options); + if (name_expansions != NULL) { + num_name_expansions = cstring_array_num_strings(name_expansions); + log_debug("Got %zu name expansions\n", num_name_expansions); + } + } + + + cstring_array *street_expansions = NULL; + size_t num_street_expansions = 0; + if (place->street != NULL) { + log_debug("Doing street expansions for %s\n", place->street); + normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; + street_expansions = expanded_component_combined(place->street, normalize_options, &num_street_expansions); + log_debug("Got %zu street expansions\n", num_street_expansions); + } + + cstring_array *house_number_expansions = NULL; + size_t num_house_number_expansions = 0; + if (place->house_number != NULL) { + log_debug("Doing house number expansions for %s\n", place->house_number); + normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; + house_number_expansions = expand_address_root(place->house_number, normalize_options, &num_house_number_expansions); + log_debug("Got %zu house number expansions\n", num_house_number_expansions); + } + + cstring_array *unit_expansions = NULL; + size_t num_unit_expansions = 0; + if (place->unit != NULL && options.with_unit) { + log_debug("Doing unit expansions for %s\n", place->unit); + normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; + unit_expansions = expand_address_root(place->unit, normalize_options, &num_unit_expansions); + log_debug("Got %zu unit expansions\n", num_unit_expansions); + } + + cstring_array *building_expansions = NULL; + size_t num_building_expansions = 0; + if (place->building != NULL && options.with_unit) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; + building_expansions = expand_address_root(place->building, normalize_options, &num_building_expansions); + } + + cstring_array *level_expansions = NULL; + 
size_t num_level_expansions = 0; + if (place->level != NULL && options.with_unit) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; + level_expansions = expand_address_root(place->level, normalize_options, &num_level_expansions); + } + + cstring_array *po_box_expansions = NULL; + size_t num_po_box_expansions = 0; + if (place->po_box != NULL) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; + po_box_expansions = expand_address_root(place->po_box, normalize_options, &num_po_box_expansions); + } + + cstring_array *place_expansions = NULL; + cstring_array *containing_expansions = NULL; + + if (options.with_city_or_equivalent) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_ANY; + + if (place->city != NULL) { + size_t num_city_expansions = 0; + cstring_array *city_expansions = expand_address_root(place->city, normalize_options, &num_city_expansions); + if (place_expansions == NULL) { + place_expansions = city_expansions; + } else if (city_expansions != NULL && num_city_expansions > 0) { + cstring_array_extend(place_expansions, city_expansions); + cstring_array_destroy(city_expansions); + } + + } + + if (place->city_district != NULL) { + size_t num_city_district_expansions = 0; + cstring_array *city_district_expansions = expand_address_root(place->city_district, normalize_options, &num_city_district_expansions); + if (place_expansions == NULL) { + place_expansions = city_district_expansions; + } else if (city_district_expansions != NULL && num_city_district_expansions > 0) { + cstring_array_extend(place_expansions, city_district_expansions); + cstring_array_destroy(city_district_expansions); + } + } + + if (place->suburb != NULL) { + size_t num_suburb_expansions = 0; + cstring_array *suburb_expansions = expand_address_root(place->suburb, normalize_options, &num_suburb_expansions); + if (place_expansions == NULL) { + place_expansions = 
suburb_expansions; + } else if (suburb_expansions != NULL && num_suburb_expansions > 0) { + cstring_array_extend(place_expansions, suburb_expansions); + cstring_array_destroy(suburb_expansions); + } + } + + + if (place->island != NULL) { + size_t num_island_expansions = 0; + cstring_array *island_expansions = expand_address_root(place->island, normalize_options, &num_island_expansions); + if (place_expansions == NULL) { + place_expansions = island_expansions; + } else if (island_expansions != NULL && num_island_expansions > 0) { + cstring_array_extend(place_expansions, island_expansions); + cstring_array_destroy(island_expansions); + } + } + + if (place->state_district != NULL) { + size_t num_state_district_expansions = 0; + cstring_array *state_district_expansions = expand_address_root(place->state_district, normalize_options, &num_state_district_expansions); + if (containing_expansions == NULL) { + containing_expansions = state_district_expansions; + } else if (state_district_expansions != NULL && num_state_district_expansions > 0) { + cstring_array_extend(containing_expansions, state_district_expansions); + cstring_array_destroy(state_district_expansions); + } + } + } + + cstring_array *postal_code_expansions = NULL; + size_t num_postal_code_expansions = 0; + if (options.with_postal_code && place->postal_code != NULL) { + normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_ANY; + postal_code_expansions = expand_address_root(place->postal_code, normalize_options, &num_postal_code_expansions); + } + + cstring_array *geohash_expansions = NULL; + if (options.with_latlon && !(double_equals(options.latitude, 0.0) && double_equals(options.longitude, 0.0))) { + geohash_expansions = geohash_and_neighbors(options.latitude, options.longitude, options.geohash_precision); + } + + size_t num_geohash_expansions = geohash_expansions != NULL ? 
cstring_array_num_strings(geohash_expansions) : 0; + if (num_geohash_expansions == 0 && num_postal_code_expansions == 0 && place_expansions == NULL && containing_expansions == NULL) { + return NULL; + } + + bool added = false; + + num_name_expansions = name_expansions != NULL ? cstring_array_num_strings(name_expansions) : 0; + num_street_expansions = street_expansions != NULL ? cstring_array_num_strings(street_expansions) : 0; + num_house_number_expansions = house_number_expansions != NULL ? cstring_array_num_strings(house_number_expansions) : 0; + num_po_box_expansions = po_box_expansions != NULL ? cstring_array_num_strings(po_box_expansions) : 0; + num_unit_expansions = unit_expansions != NULL ? cstring_array_num_strings(unit_expansions) : 0; + num_building_expansions = building_expansions != NULL ? cstring_array_num_strings(building_expansions) : 0; + num_level_expansions = level_expansions != NULL ? cstring_array_num_strings(level_expansions) : 0; + + bool have_unit = num_unit_expansions > 0 || num_building_expansions > 0 || num_level_expansions > 0; + cstring_array *unit_or_equivalent_expansions = NULL; + if (num_unit_expansions > 0) { + unit_or_equivalent_expansions = unit_expansions; + } else if (num_building_expansions > 0) { + unit_or_equivalent_expansions = building_expansions; + } else if (num_level_expansions > 0) { + unit_or_equivalent_expansions = level_expansions; + } + + cstring_array *near_dupe_hashes = cstring_array_new(); + + if (num_name_expansions > 0) { + if (num_street_expansions > 0 && num_house_number_expansions > 0 && options.name_and_address_keys) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_GEOHASH_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, 
NAME_ADDRESS_UNIT_CITY_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_CONTAINING_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_POSTCODE_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_CITY_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, postal_code_expansions); + } + } + // Japan, other places with no street names + } else if (num_house_number_expansions > 0 && options.name_and_address_keys) { + // House number and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, 
geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // House number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_CITY_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, postal_code_expansions); + } + } + // Addresses in India, UK, Ireland, many university addresses, etc. 
may have house name + street with no house numbers + } else if (num_street_expansions > 0 && options.name_and_address_keys) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_CITY_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, street_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_CITY_KEY_PREFIX, tree, 3, name_expansions, street_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, street_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_STREET_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, street_expansions, postal_code_expansions); + } + } + // PO Box only addresses, mailing addresses + } else if (num_po_box_expansions 
> 0 && options.name_and_address_keys) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, geohash_expansions); + } + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_CITY_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, postal_code_expansions); + } + // Only name + } else if (options.name_only_keys) { + // Have name and unit, some university addresses + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_CITY_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have name and geo only + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_GEOHASH_KEY_PREFIX, tree, 2, name_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + 
add_string_hash_permutations(near_dupe_hashes, NAME_CITY_KEY_PREFIX, tree, 2, name_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_CONTAINING_KEY_PREFIX, tree, 2, name_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, NAME_POSTCODE_KEY_PREFIX, tree, 2, name_expansions, postal_code_expansions); + } + } + } + } + + if (options.address_only_keys) { + if (num_street_expansions > 0 && num_house_number_expansions > 0) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_GEOHASH_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_CITY_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_CONTAINING_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_POSTCODE_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_GEOHASH_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_CITY_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, place_expansions); + } + 
+ if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_CONTAINING_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, ADDRESS_POSTCODE_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, postal_code_expansions); + } + } + // Japan, other places with no street names + } else if (num_house_number_expansions > 0) { + // House number and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // House number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_GEOHASH_KEY_PREFIX, tree, 2, house_number_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_CITY_KEY_PREFIX, tree, 2, house_number_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_CONTAINING_KEY_PREFIX, tree, 2, house_number_expansions, containing_expansions); + } + + if 
(postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_POSTCODE_KEY_PREFIX, tree, 2, house_number_expansions, postal_code_expansions); + } + } + // Addresses in India, UK, Ireland, many university addresses, etc. may have house name + street with no house numbers + } else if (num_street_expansions > 0) { + // Have street, house number, and unit + if (have_unit) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_GEOHASH_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_CITY_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_CONTAINING_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_POSTCODE_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, postal_code_expansions); + } + // Have street and house number, no unit + } else { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_GEOHASH_KEY_PREFIX, tree, 2, street_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_CITY_KEY_PREFIX, tree, 2, street_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_CONTAINING_KEY_PREFIX, tree, 2, street_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, STREET_POSTCODE_KEY_PREFIX, tree, 2, street_expansions, postal_code_expansions); + } + } + // PO Box only addresses, mailing 
addresses + } else if (num_po_box_expansions > 0) { + if (geohash_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, PO_BOX_GEOHASH_KEY_PREFIX, tree, 2, po_box_expansions, geohash_expansions); + } + + if (place_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, PO_BOX_CITY_KEY_PREFIX, tree, 2, po_box_expansions, place_expansions); + } + + if (containing_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, PO_BOX_CONTAINING_KEY_PREFIX, tree, 2, po_box_expansions, containing_expansions); + } + + if (postal_code_expansions != NULL) { + add_string_hash_permutations(near_dupe_hashes, PO_BOX_POSTCODE_KEY_PREFIX, tree, 2, po_box_expansions, postal_code_expansions); + } + } + + } + + if (tree != NULL) { + string_tree_destroy(tree); + } + + if (name_expansions != NULL) { + cstring_array_destroy(name_expansions); + } + + if (street_expansions != NULL) { + cstring_array_destroy(street_expansions); + } + + if (house_number_expansions != NULL) { + cstring_array_destroy(house_number_expansions); + } + + if (unit_expansions != NULL) { + cstring_array_destroy(unit_expansions); + } + + if (building_expansions != NULL) { + cstring_array_destroy(building_expansions); + } + + if (level_expansions != NULL) { + cstring_array_destroy(level_expansions); + } + + if (po_box_expansions != NULL) { + cstring_array_destroy(po_box_expansions); + } + + if (place_expansions != NULL) { + cstring_array_destroy(place_expansions); + } + + + if (containing_expansions != NULL) { + cstring_array_destroy(containing_expansions); + } + + if (postal_code_expansions != NULL) { + cstring_array_destroy(postal_code_expansions); + } + + if (geohash_expansions != NULL) { + cstring_array_destroy(geohash_expansions); + } + + if (lang_response != NULL) { + language_classifier_response_destroy(lang_response); + } + + return near_dupe_hashes; +} + +inline cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, 
libpostal_near_dupe_hash_options_t options) { + return near_dupe_hashes_languages(num_components, labels, values, options, 0, NULL); +} diff --git a/src/near_dupe.h b/src/near_dupe.h new file mode 100644 index 00000000..9e3d33f8 --- /dev/null +++ b/src/near_dupe.h @@ -0,0 +1,14 @@ + +#ifndef NEAR_DUPE_H +#define NEAR_DUPE_H + +#include +#include + +#include "libpostal.h" +#include "string_utils.h" + +cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options); +cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages); + +#endif \ No newline at end of file From 8b75c44026aa8eeb7a72b9ebb21961b86469fec2 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 12:41:44 -0500 Subject: [PATCH 57/89] [dedupe] adding a test program for near dupe hashing that simply prints out the results. Automated tests in the works --- src/near_dupe_test.c | 122 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 src/near_dupe_test.c diff --git a/src/near_dupe_test.c b/src/near_dupe_test.c new file mode 100644 index 00000000..18155dea --- /dev/null +++ b/src/near_dupe_test.c @@ -0,0 +1,122 @@ +#include +#include + +#include "libpostal.h" +#include "string_utils.h" + +int main(int argc, char **argv) { + if (argc < 3) { + printf("Usage: ./test_near_dupe label value [...]\n"); + exit(EXIT_FAILURE); + } + + if (!libpostal_setup() || !libpostal_setup_language_classifier()) { + exit(EXIT_FAILURE); + } + + libpostal_near_dupe_hash_options_t options = libpostal_near_dupe_hash_default_options(); + + cstring_array *labels_array = cstring_array_new(); + cstring_array *values_array = cstring_array_new(); + cstring_array *languages_array = NULL; + + bool label = true; + bool next_is_latitude = false; + bool next_is_longitude = false; + bool next_is_geohash_precision = 
false; + bool have_latitude = false; + bool have_longitude = false; + bool next_is_language = false; + double longitude = 0.0; + double latitude = 0.0; + + + for (size_t i = 1; i < argc; i++) { + char *arg = argv[i]; + + if (string_equals(arg, "--with-unit")) { + options.with_unit = true; + } else if (string_equals(arg, "--latitude")) { + next_is_latitude = true; + } else if (string_equals(arg, "--longitude")) { + next_is_longitude = true; + } else if (string_equals(arg, "--geohash-precision")) { + next_is_geohash_precision = true; + } else if (string_equals(arg, "--name-only-keys")) { + options.name_only_keys = true; + } else if (string_equals(arg, "--address-only-keys")) { + options.address_only_keys = true; + } else if (string_equals(arg, "--language")) { + next_is_language = true; + } else if (next_is_latitude) { + sscanf(arg, "%lf", &latitude); + next_is_latitude = false; + have_latitude = true; + } else if (next_is_longitude) { + sscanf(arg, "%lf", &longitude); + next_is_longitude = false; + have_longitude = true; + } else if (next_is_geohash_precision) { + size_t geohash_precision = 0; + sscanf(arg, "%zu", &geohash_precision); + options.geohash_precision = geohash_precision; + next_is_geohash_precision = false; + } else if (next_is_language) { + if (languages_array == NULL) { + languages_array = cstring_array_new(); + } + cstring_array_add_string(languages_array, arg); + } else if (label) { + cstring_array_add_string(labels_array, arg); + label = false; + } else { + cstring_array_add_string(values_array, arg); + label = true; + } + } + + if (have_latitude && have_longitude) { + options.with_latlon = true; + options.latitude = latitude; + options.longitude = longitude; + } + + size_t num_languages = 0; + char **languages = NULL; + if (languages_array != NULL) { + num_languages = cstring_array_num_strings(languages_array); + languages = cstring_array_to_strings(languages_array); + } + + + size_t num_components = cstring_array_num_strings(labels_array); + if 
(num_components != cstring_array_num_strings(values_array)) { + cstring_array_destroy(labels_array); + cstring_array_destroy(values_array); + printf("Must have same number of labels and values\n"); + exit(EXIT_FAILURE); + } + + char **labels = cstring_array_to_strings(labels_array); + char **values = cstring_array_to_strings(values_array); + + size_t num_near_dupe_hashes = 0; + char **near_dupe_hashes = libpostal_near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages, &num_near_dupe_hashes); + + for (size_t i = 0; i < num_near_dupe_hashes; i++) { + char *near_dupe_hash = near_dupe_hashes[i]; + printf("%s\n", near_dupe_hash); + } + + libpostal_expansion_array_destroy(near_dupe_hashes, num_near_dupe_hashes); + libpostal_expansion_array_destroy(labels, num_components); + libpostal_expansion_array_destroy(values, num_components); + + if (languages != NULL) { + libpostal_expansion_array_destroy(languages, num_languages); + } + + libpostal_teardown(); + libpostal_teardown_language_classifier(); + +} From f3a626463a77f5f506790c08584af7b350c753b3 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 12:43:28 -0500 Subject: [PATCH 58/89] [api] adding API functions for near dupe hashes to the public header --- src/libpostal.c | 42 ++++++++++++++++++++++++++++++++++++++++-- src/libpostal.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/libpostal.c b/src/libpostal.c index f12d4898..2825bab4 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -11,6 +11,7 @@ #include "expand.h" #include "language_classifier.h" +#include "near_dupe.h" #include "normalize.h" #include "scanner.h" #include "string_utils.h" @@ -45,17 +46,54 @@ libpostal_normalize_options_t libpostal_get_default_options(void) { } char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { - return expand_address(input, options, n); + cstring_array *strings = expand_address(input, 
options, n); + return cstring_array_to_strings(strings); } char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { - return expand_address_root(input, options, n); + cstring_array *strings = expand_address_root(input, options, n); + return cstring_array_to_strings(strings); } void libpostal_expansion_array_destroy(char **expansions, size_t n) { expansion_array_destroy(expansions, n); } +#define DEFAULT_NEAR_DUPE_GEOHASH_PRECISION 6 + +static libpostal_near_dupe_hash_options_t LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS = { + .with_name = true, + .with_address = true, + .with_unit = false, + .with_city_or_equivalent = true, + .with_small_containing_boundaries = true, + .with_postal_code = true, + .with_latlon = false, + .latitude = 0.0, + .longitude = 0.0, + .geohash_precision = DEFAULT_NEAR_DUPE_GEOHASH_PRECISION, + .name_and_address_keys = true, + .name_only_keys = false, + .address_only_keys = false +}; + +libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void) { + return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS; +} + +char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) { + cstring_array *strings = near_dupe_hashes(num_components, labels, values, options); + *num_hashes = cstring_array_num_strings(strings); + return cstring_array_to_strings(strings); +} + + +char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes) { + cstring_array *strings = near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages); + *num_hashes = cstring_array_num_strings(strings); + return cstring_array_to_strings(strings); +} + void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) { if (self == NULL) return; diff 
--git a/src/libpostal.h b/src/libpostal.h index f088db72..dc1c3c4e 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -152,6 +152,8 @@ typedef struct libpostal_address_parser_response { char **labels; } libpostal_address_parser_response_t; +typedef libpostal_address_parser_response_t libpostal_parsed_address_components_t; + typedef struct libpostal_address_parser_options { char *language; char *country; @@ -165,6 +167,32 @@ LIBPOSTAL_EXPORT libpostal_address_parser_response_t *libpostal_parse_address(ch LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features); + +/* +Deduping +*/ + +typedef struct libpostal_near_dupe_hash_options { + bool with_name; + bool with_address; + bool with_unit; + bool with_city_or_equivalent; + bool with_small_containing_boundaries; + bool with_postal_code; + bool with_latlon; + double latitude; + double longitude; + size_t geohash_precision; + bool name_and_address_keys; + bool name_only_keys; + bool address_only_keys; +} libpostal_near_dupe_hash_options_t; + + +LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void); +LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); +LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes); + // Setup/teardown methods LIBPOSTAL_EXPORT bool libpostal_setup(void); From acbebc9ecfd388efbfd554f39a191d9f90a7c172 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 12:44:10 -0500 Subject: [PATCH 59/89] [build] adding new source files for near dupe hashing and the command-line program to the Makefile --- src/Makefile.am | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index e76a3a1e..07af51d3 100644 --- 
a/src/Makefile.am +++ b/src/Makefile.am @@ -12,7 +12,7 @@ DEFAULT_INCLUDES = -I.. -I/usr/local/include CFLAGS = lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = strndup.c libpostal.c expand.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c +libpostal_la_SOURCES = strndup.c libpostal.c expand.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c place.c near_dupe.c double_metaphone.c geohash/geohash.c libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined @@ -26,7 +26,7 @@ noinst_LTLIBRARIES = libscanner.la libscanner_la_SOURCES = klib/drand48.c scanner.c libscanner_la_CFLAGS = $(CFLAGS_O0) -D LIBPOSTAL_EXPORTS $(CFLAGS_SCANNER_EXTRA) -noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train address_parser_test language_classifier_train language_classifier language_classifier_test +noinst_PROGRAMS = libpostal bench address_parser address_parser_train address_parser_test build_address_dictionary build_numex_table build_trans_table address_parser_train 
address_parser_test language_classifier_train language_classifier language_classifier_test near_dupe_test libpostal_SOURCES = strndup.c main.c json_encode.c file_utils.c string_utils.c utf8proc/utf8proc.c libpostal_LDADD = libpostal.la @@ -38,6 +38,11 @@ address_parser_SOURCES = strndup.c address_parser_cli.c json_encode.c linenoise/ address_parser_LDADD = libpostal.la $(CBLAS_LIBS) address_parser_CFLAGS = $(CFLAGS_O3) +near_dupe_test_SOURCES = strndup.c near_dupe_test.c string_utils.c utf8proc/utf8proc.c +near_dupe_test_LDADD = libpostal.la +near_dupe_test_CFLAGS = $(CFLAGS_O3) + + build_address_dictionary_SOURCES = strndup.c address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c build_address_dictionary_CFLAGS = $(CFLAGS_O3) build_numex_table_SOURCES = strndup.c numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c From a3f39be0d47763bb44108af8855c29b8e3c441b3 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 24 Dec 2017 23:51:35 -0500 Subject: [PATCH 60/89] [fix] reverting gazetteer changes as it would affect the parser features as well and require retraining --- src/gazetteer_data.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gazetteer_data.c b/src/gazetteer_data.c index 82d75362..444c225e 100644 --- a/src/gazetteer_data.c +++ b/src/gazetteer_data.c @@ -10,7 +10,7 @@ gazetteer_t gazetteer_config[] = { {DICTIONARY_CONCATENATED_SUFFIX_INSEPARABLE, LIBPOSTAL_ADDRESS_ANY}, {DICTIONARY_CONCATENATED_SUFFIX_SEPARABLE, LIBPOSTAL_ADDRESS_ANY}, {DICTIONARY_CROSS_STREET, LIBPOSTAL_ADDRESS_STREET}, - {DICTIONARY_DIRECTIONAL, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE}, + {DICTIONARY_DIRECTIONAL, LIBPOSTAL_ADDRESS_ANY}, 
{DICTIONARY_ELISION, LIBPOSTAL_ADDRESS_ANY}, {DICTIONARY_ENTRANCE, LIBPOSTAL_ADDRESS_ENTRANCE}, {DICTIONARY_GIVEN_NAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, @@ -33,10 +33,10 @@ gazetteer_t gazetteer_config[] = { {DICTIONARY_POSTAL_CODE, LIBPOSTAL_ADDRESS_POSTAL_CODE}, {DICTIONARY_QUALIFIER, LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_STAIRCASE, LIBPOSTAL_ADDRESS_STAIRCASE}, - {DICTIONARY_STOPWORD, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM}, - {DICTIONARY_STREET_TYPE, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, + {DICTIONARY_STOPWORD, LIBPOSTAL_ADDRESS_ANY}, + {DICTIONARY_STREET_TYPE, LIBPOSTAL_ADDRESS_STREET}, {DICTIONARY_SURNAME, LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_NAME}, - {DICTIONARY_SYNONYM, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM}, + {DICTIONARY_SYNONYM, LIBPOSTAL_ADDRESS_ANY}, {DICTIONARY_TOPONYM, LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_TOPONYM}, {DICTIONARY_UNIT_NUMBERED, LIBPOSTAL_ADDRESS_UNIT}, {DICTIONARY_UNIT_STANDALONE, LIBPOSTAL_ADDRESS_UNIT}, From b4ce042f80bc5e3b8e4108108ef7546119568151 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 25 Dec 2017 00:29:52 -0500 Subject: [PATCH 61/89] [dictionaries] removing ave/avens/aves from ambiguous --- resources/dictionaries/en/ambiguous_expansions.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/resources/dictionaries/en/ambiguous_expansions.txt b/resources/dictionaries/en/ambiguous_expansions.txt index f678ccb1..5f4ad757 100644 --- a/resources/dictionaries/en/ambiguous_expansions.txt +++ b/resources/dictionaries/en/ambiguous_expansions.txt @@ -16,9 +16,6 @@ apt arc art arty -ave -avens -aves ba bc bot From 152761fcbccc0c977d6c2f6643a97f33a5f2739c Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 25 Dec 2017 01:37:29 -0500 Subject: [PATCH 62/89] [expand] adding improvements to 
root expansions (using possible phrase roots even if they're abbreviated e.g. "E Ctr St", adding special valid components check for root expansions beyond what's stored in the build address dictionaries), removing spaces before checking unique strings, only splitting numeric from alpha in the case of non-ordinals, using cstring_array internally and char ** in the public API --- src/expand.c | 170 +++++++++++++++++++++++++++++++++++---------------- src/expand.h | 6 +- src/tokens.h | 2 + 3 files changed, 121 insertions(+), 57 deletions(-) diff --git a/src/expand.c b/src/expand.c index 80b4250e..4670280c 100644 --- a/src/expand.c +++ b/src/expand.c @@ -44,6 +44,39 @@ inline uint64_t get_normalize_string_options(libpostal_normalize_options_t optio return normalize_string_options; } + +inline size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { + size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); + + int32_t unichr = 0; + const uint8_t *ptr = (const uint8_t *)str; + + if (len_ordinal_suffix > 0) { + ssize_t start = 0; + size_t token_offset = token.offset; + size_t token_len = token.len; + + if (len_ordinal_suffix < token.len) { + start = token.offset + token.len - len_ordinal_suffix; + token_offset = token.offset; + token_len = token.len - len_ordinal_suffix; + } else { + start = prev_token.offset + prev_token.len; + token_offset = prev_token.offset; + token_len = prev_token.len; + } + ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); + if (prev_char_len <= 0) return 0; + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) { + return 0; + } + } else { + return 0; + } + + return len_ordinal_suffix; +} + void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) { uint64_t normalize_token_options = get_normalize_token_options(options); @@ -82,6 +115,7 @@ void 
add_normalized_strings_token(cstring_array *strings, char *str, token_t tok } } else if (is_numeric_token(token.type)) { + normalize_token(strings, str, token, normalize_token_options); if (options.replace_word_hyphens || options.replace_numeric_hyphens) { @@ -105,9 +139,21 @@ void add_normalized_strings_token(cstring_array *strings, char *str, token_t tok } if (is_numeric_token(token.type) && options.split_alpha_from_numeric) { - normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; - normalize_token(strings, str, token, normalize_token_options); - normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + bool split_alpha_from_numeric = true; + + for (size_t i = 0; i < options.num_languages; i++) { + char *lang = options.languages[i]; + if (valid_ordinal_suffix_len(str, token, NULL_TOKEN, lang) > 1) { + split_alpha_from_numeric = false; + break; + } + } + + if (split_alpha_from_numeric) { + normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + normalize_token(strings, str, token, normalize_token_options); + normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC; + } } } else { cstring_array_add_string(strings, " "); @@ -492,7 +538,7 @@ inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { case DICTIONARY_STAIRCASE: return LIBPOSTAL_ADDRESS_STAIRCASE; case DICTIONARY_STOPWORD: - return LIBPOSTAL_ADDRESS_ANY; + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; case DICTIONARY_STREET_TYPE: return LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_UNIT_NUMBERED: @@ -506,11 +552,31 @@ inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { } } + +inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) { + switch (dictionary_id) { + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | 
LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE; + case DICTIONARY_STOPWORD: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; + case DICTIONARY_STREET_TYPE: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_SYNONYM: + return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM; + default: + return LIBPOSTAL_ADDRESS_NONE; + } +} + inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) { switch (dictionary_id) { // Pre/post directionals can be removed if there are non-phrase tokens case DICTIONARY_DIRECTIONAL: return LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_COMPANY_TYPE: + return LIBPOSTAL_ADDRESS_NAME; + case DICTIONARY_PLACE_NAME: + return LIBPOSTAL_ADDRESS_NAME; default: return LIBPOSTAL_ADDRESS_NONE; } @@ -538,12 +604,14 @@ inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_ACADEMIC_DEGREE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + case DICTIONARY_DIRECTIONAL: + return LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_PERSONAL_TITLE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_NUMBER: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_PLACE_NAME: - return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; + return LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_QUALIFIER: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; case DICTIONARY_SYNONYM: @@ -559,7 +627,8 @@ typedef enum { GAZETTEER_MATCH_IGNORABLE, GAZETTEER_MATCH_EDGE_IGNORABLE, GAZETTEER_MATCH_POSSIBLE_ROOT, - GAZETTEER_MATCH_SPECIFIER + GAZETTEER_MATCH_SPECIFIER, + GAZETTEER_MATCH_VALID_COMPONENTS } gazetteer_match_type_t; @@ -580,6 +649,9 @@ inline bool 
address_expansion_matches_type_for_components(address_expansion_t ex case GAZETTEER_MATCH_SPECIFIER: components = gazetteer_specifier_components(dictionary_id); break; + case GAZETTEER_MATCH_VALID_COMPONENTS: + components = gazetteer_valid_components(dictionary_id); + break; default: break; } @@ -606,6 +678,11 @@ inline bool address_expansion_is_specifier_for_components(address_expansion_t ex return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER); } +inline bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) { + return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); +} + + bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) { uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); @@ -642,6 +719,11 @@ inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER); } +inline bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components) { + return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); +} + + bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) { address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data); if (value == NULL) return false; @@ -751,6 +833,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool have_non_phrase_tokens = false; bool have_canonical_phrases = false; bool have_ambiguous = false; + bool have_possible_root = false; bool have_strictly_ignorable = false; bool have_strictly_ignorable_abbreviation = false; @@ -796,6 +879,8 @@ string_tree_t 
*add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("have_strictly_ignorable=%zu, phrase_is_canonical=%zu\n", have_strictly_ignorable, phrase_is_canonical); } + have_possible_root = have_possible_root | address_phrase_is_possible_root_for_components(phrase, options.address_components); + have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous); have_ambiguous = have_ambiguous || phrase_is_ambiguous; @@ -875,7 +960,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal uint32_t expansion_index = phrase.data; address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index); - bool expansion_valid_components = value->components & options.address_components; + bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components); if (expansion_valid_components) { key->n = namespace_len; @@ -884,7 +969,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (token.type != WHITESPACE) { char_array_cat_len(key, str + token.offset, token.len); last_added_was_whitespace = false; - } else { + } else if (!last_added_was_whitespace) { char_array_cat(key, " "); last_added_was_whitespace = true; } @@ -902,6 +987,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components); bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase); + bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(phrase, options.address_components); log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier); @@ -950,9 +1036,9 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, 
libpostal_normal } } else { - // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", this is probably a legit token instead of a pre-directional - skip_edge_phrase = address_phrase_is_possible_root_for_components(other_phrase, options.address_components) && address_phrase_has_canonical_interpretation(other_phrase); - log_debug("phrase is possible root = %d\n", skip_edge_phrase); + // If we encounter an ignorable phrase like St and we're _not_ the end of the string e.g. "E St SE", the first token is probably a legit token instead of a pre-directional + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(other_phrase) || address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + log_debug("phrase is possible root. skip_edge_phrase = %d\n", skip_edge_phrase); } break; } @@ -1019,7 +1105,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, options.address_components); bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX; - log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable); + log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d, current_phrase_have_possible_root=%d\n", is_ignorable, is_canonical, 
is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable, current_phrase_have_possible_root); current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous; @@ -1034,22 +1120,22 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("skip_edge_phrase = %d\n", skip_edge_phrase); current_phrase_ignorable = skip_edge_phrase; // Don't delete "PH" in "PH 1" for unit expansions - } else if (is_ignorable && have_non_phrase_tokens && current_phrase_have_specifier) { + } else if (is_ignorable && current_phrase_have_specifier) { log_debug("current_phrase_have_specifier\n"); current_phrase_ignorable = false; // Delete "Avenue" in "5th Avenue" } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) { log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n"); - current_phrase_ignorable = have_non_phrase_tokens || string_tree_num_tokens(tree) > 0; + current_phrase_ignorable = have_non_phrase_tokens || string_tree_num_tokens(tree) > 0 || (have_possible_root && !current_phrase_have_possible_root); log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S" } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) { log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n"); - current_phrase_ignorable = have_non_phrase_tokens || have_canonical_phrases || have_ambiguous; + current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0; log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); - } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases)) { + } else if (current_phrase_have_ambiguous && (have_non_phrase_tokens || have_canonical_phrases || 
have_possible_root)) { log_debug("have_non_phrase_tokens = %d, have_canonical_phrases = %d\n", have_non_phrase_tokens, have_canonical_phrases); - current_phrase_ignorable = is_ignorable || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); + current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && have_non_phrase_tokens && current_phrase_have_ignorable && current_phrase_have_unambiguous); log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens\n"); log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); @@ -1075,7 +1161,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal current_phrase_expandable = !current_phrase_ignorable; } - log_debug("expand_phrases = %d\n", expand_phrases); + log_debug("current_phrase_expandable = %d\n", current_phrase_expandable); log_debug("expansion.canonical_index = %d\n", expansion.canonical_index); @@ -1213,7 +1299,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } if (token.type != WHITESPACE) { - if (j > 0 && last_was_punctuation && !last_added_was_whitespace) { + if (j > 0 && last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding another space\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); @@ -1280,33 +1366,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { - size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); - - int32_t unichr = 0; - const uint8_t *ptr = (const uint8_t *)str; - - if (len_ordinal_suffix > 0) { - ssize_t start = 0; - size_t token_offset = 
token.offset; - size_t token_len = token.len; - - if (len_ordinal_suffix < token.len) { - start = token.offset + token.len - len_ordinal_suffix; - token_offset = token.offset; - token_len = token.len - len_ordinal_suffix; - } else { - start = prev_token.offset + prev_token.len; - token_offset = prev_token.offset; - token_len = prev_token.len; - } - ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); - if (prev_char_len <= 0) return false; - if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str + token_offset, token_len)) { - return false; - } - } else { - return false; - } + size_t len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang); cstring_array *strings = tree->strings; // Add the original form first. When this function returns true, @@ -1440,15 +1500,17 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * continue; } + char *dupe_token = strndup(token + left_spaces, token_len - left_spaces - right_spaces); + log_debug("full string=%s\n", token); - khiter_t k = kh_get(str_set, unique_strings, token); + khiter_t k = kh_get(str_set, unique_strings, dupe_token); if (k == kh_end(unique_strings)) { - char *dupe_token = strndup(str + left_spaces, len - left_spaces - right_spaces); - log_debug("doing postprocessing\n"); - add_postprocessed_string(strings, token, options); + add_postprocessed_string(strings, dupe_token, options); k = kh_put(str_set, unique_strings, dupe_token, &ret); + } else { + free(dupe_token); } log_debug("iter->remaining = %d\n", iter->remaining); @@ -1476,7 +1538,7 @@ void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) * -char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { +cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) { 
options.address_components |= LIBPOSTAL_ADDRESS_ANY; uint64_t normalize_string_options = get_normalize_string_options(options); @@ -1551,15 +1613,15 @@ char **expand_address_phrase_option(char *input, libpostal_normalize_options_t o *n = cstring_array_num_strings(strings); - return cstring_array_to_strings(strings); + return strings; } -char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { +cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { return expand_address_phrase_option(input, options, n, EXPAND_PHRASES); } -char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { +cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { return expand_address_phrase_option(input, options, n, DELETE_PHRASES); } diff --git a/src/expand.h b/src/expand.h index 0f961f81..2857f402 100644 --- a/src/expand.h +++ b/src/expand.h @@ -48,9 +48,9 @@ typedef enum { DELETE_PHRASES } expansion_phrase_option_t; -char **expand_address(char *input, libpostal_normalize_options_t options, size_t *n); -char **expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); -char **expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); +cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n); +cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option); +cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n); void expansion_array_destroy(char **expansions, size_t n); #endif diff --git a/src/tokens.h b/src/tokens.h index 8823a628..bf61f5bc 100644 --- a/src/tokens.h +++ b/src/tokens.h @@ -14,6 +14,8 @@ typedef libpostal_token_t token_t; +#define NULL_TOKEN (token_t){0, 0, END} + 
VECTOR_INIT(token_array, token_t) typedef struct tokenized_string { From 2afcd747797a8494d66e441222479381ad487808 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 25 Dec 2017 01:38:50 -0500 Subject: [PATCH 63/89] [test] adding E Ctr St tests --- test/test_expand.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_expand.c b/test/test_expand.c index 6436fb92..2b451295 100644 --- a/test/test_expand.c +++ b/test/test_expand.c @@ -163,6 +163,14 @@ TEST test_street_root_expansions(void) { CHECK_CALL(test_root_expansion_contains("Avenue Rd", "avenue", options)); CHECK_CALL(test_root_expansion_contains("Broadway", "broadway", options)); CHECK_CALL(test_root_expansion_contains("E Broadway", "east", options)); + CHECK_CALL(test_root_expansion_contains("E Center St", "center", options)); + CHECK_CALL(test_root_expansion_contains("E Ctr St", "center", options)); + CHECK_CALL(test_root_expansion_contains("E Center Street", "center", options)); + CHECK_CALL(test_root_expansion_contains("E Ctr Street", "center", options)); + CHECK_CALL(test_root_expansion_contains("Center St E", "center", options)); + CHECK_CALL(test_root_expansion_contains("Ctr St E", "center", options)); + CHECK_CALL(test_root_expansion_contains("Center Street E", "center", options)); + CHECK_CALL(test_root_expansion_contains("Ctr Street E", "center", options)); // Spanish CHECK_CALL(test_root_expansion_contains("C/ Ocho", "8", options)); From b17b2bdcc4b65cf22bce32da9941bbc9ff14136c Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Dec 2017 19:12:01 -0500 Subject: [PATCH 64/89] [dictionaries] adding hill/hills to synonyms lists in English. 
In general any ambiguous street types that can also be part of a core street name can also be stored in synonyms --- resources/dictionaries/en/synonyms.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/dictionaries/en/synonyms.txt b/resources/dictionaries/en/synonyms.txt index 98f09a4a..b6ac8907 100644 --- a/resources/dictionaries/en/synonyms.txt +++ b/resources/dictionaries/en/synonyms.txt @@ -18,6 +18,8 @@ greater|grtr|gtr greens|grns groves|grvs heights|hghts|hgts|hieghts|ht|hts|hgths +hill|hl +hills|hls international|intl|int'l lake|lk lakes|lks From b4fdc51bf952eb9eece330f8799ac032519522b1 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Dec 2017 19:27:23 -0500 Subject: [PATCH 65/89] [numex] changing is_roman_numeral to is_likely_roman_numeral to get rid of most of the false positives like \"La\" in Spanish which could be L(=50) + the ordinal suffix \"a\", but in practice it never means that. For Roman numerals that are shorter than two characters (whether on their own like "DC" or "MD", or attached to a potential ordinal suffix like \"Ce\" in French), will be ignored unless they're composed of more likely, smaller, Roman numerals: I, V, and X, so VI, IX, etc. are expanded as Roman numerals but LI is not. 
--- src/numex.c | 153 ++++++++++++++++++++++++++++++++-------------------- src/numex.h | 4 +- 2 files changed, 98 insertions(+), 59 deletions(-) diff --git a/src/numex.c b/src/numex.c index 2d4161d9..7f4ef630 100644 --- a/src/numex.c +++ b/src/numex.c @@ -439,7 +439,7 @@ bool numex_table_read(FILE *f) { log_debug("read num_languages = %" PRIu64 "\n", num_languages); - int i = 0; + size_t i = 0; numex_language_t *language; @@ -541,7 +541,7 @@ bool numex_table_write(FILE *f) { numex_rule_t rule; - int i = 0; + size_t i = 0; for (i = 0; i < num_rules; i++) { rule = numex_table->rules->a[i]; @@ -1137,23 +1137,115 @@ size_t ordinal_suffix_len(char *str, size_t len, char *lang) { return 0; } + + +static inline bool is_roman_numeral_char(char c) { + return (c == 'i' || + c == 'v' || + c == 'x' || + c == 'l' || + c == 'c' || + c == 'd' || + c == 'm' || + c == 'I' || + c == 'V' || + c == 'X' || + c == 'L' || + c == 'C' || + c == 'D' || + c == 'M'); +} + +static inline bool is_likely_single_roman_numeral_char(char c) { + return (c == 'i' || + c == 'v' || + c == 'x' || + c == 'I' || + c == 'V' || + c == 'X'); +} + + +bool is_valid_roman_numeral(char *str, size_t len) { + char *copy = strndup(str, len); + if (copy == NULL) return false; + + numex_result_array *results = convert_numeric_expressions(copy, LATIN_LANGUAGE_CODE); + if (results == NULL) { + free(copy); + return false; + } + + bool ret = results->n == 1 && results->a[0].len == len; + numex_result_array_destroy(results); + free(copy); + return ret; +} + +bool is_likely_roman_numeral_len(char *str, size_t len) { + bool seen_roman = false; + for (size_t i = 0; i < len; i++) { + char c = *(str + i); + if (c == 0) break; + if ((len <= 2 && is_likely_single_roman_numeral_char(c)) || (len > 2 && is_roman_numeral_char(c))) { + seen_roman = true; + } else { + return false; + } + } + + return seen_roman && is_valid_roman_numeral(str, len); +} + +inline bool is_likely_roman_numeral(char *str) { + return 
is_likely_roman_numeral_len(str, strlen(str)); +} + char *replace_numeric_expressions(char *str, char *lang) { numex_result_array *results = convert_numeric_expressions(str, lang); if (results == NULL) return NULL; + bool is_latin = string_equals(lang, LATIN_LANGUAGE_CODE); + size_t len = strlen(str); char_array *replacement = char_array_new_size(len); size_t start = 0; size_t end = 0; - for (int i = 0; i < results->n; i++) { - numex_result_t result = results->a[i]; + bool have_valid_numex = false; + numex_result_t result = NULL_NUMEX_RESULT; + + for (size_t i = 0; i < results->n; i++) { + result = results->a[i]; if (result.len == 0) { continue; } + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + continue; + } + have_valid_numex = true; + break; + } + + if (!have_valid_numex) { + numex_result_array_destroy(results); + return NULL; + } + + for (size_t i = 0; i < results->n; i++) { + result = results->a[i]; + + if (result.len == 0) { + continue; + } + + if (is_latin && result.len <= 2 && !is_likely_roman_numeral_len(str + result.start, result.len)) { + continue; + } + end = result.start; log_debug("lang=%s, start = %zu, len = %zu, value=%" PRId64 "\n", lang, result.start, result.len, result.value); @@ -1184,56 +1276,3 @@ char *replace_numeric_expressions(char *str, char *lang) { return char_array_to_string(replacement); } - -static inline bool is_roman_numeral_char(char c) { - return (c == 'i' || - c == 'v' || - c == 'x' || - c == 'l' || - c == 'c' || - c == 'd' || - c == 'm' || - c == 'I' || - c == 'V' || - c == 'X' || - c == 'L' || - c == 'C' || - c == 'D' || - c == 'M'); -} - -bool is_valid_roman_numeral(char *str, size_t len) { - char *copy = strndup(str, len); - if (copy == NULL) return false; - - numex_result_array *results = convert_numeric_expressions(copy, LATIN_LANGUAGE_CODE); - if (results == NULL) { - free(copy); - return false; - } - - bool ret = results->n == 1 && results->a[0].len == len; - 
numex_result_array_destroy(results); - free(copy); - return ret; -} - -bool is_roman_numeral_len(char *str, size_t len) { - size_t i = 0; - bool seen_roman = false; - for (size_t i = 0; i < len; i++) { - char c = *(str + i); - if (c == 0) break; - if (is_roman_numeral_char(c)) { - seen_roman = true; - } else { - return false; - } - } - - return seen_roman && is_valid_roman_numeral(str, len); -} - -inline bool is_roman_numeral(char *str) { - return is_roman_numeral_len(str, strlen(str)); -} diff --git a/src/numex.h b/src/numex.h index f4536bb7..1a0d89b7 100644 --- a/src/numex.h +++ b/src/numex.h @@ -152,8 +152,8 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang); size_t ordinal_suffix_len(char *s, size_t len, char *lang); size_t possible_ordinal_digit_len(char *str, size_t len); -bool is_roman_numeral(char *str); -bool is_roman_numeral_len(char *str, size_t len); +bool is_likely_roman_numeral(char *str); +bool is_likely_roman_numeral_len(char *str, size_t len); bool numex_table_write(FILE *file); bool numex_table_save(char *filename); From d731339811c80221a112ef47792ef69f7488e861 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Dec 2017 21:48:54 -0500 Subject: [PATCH 66/89] [expand] fixing case where too many permutations were getting added for longer strings due to the new-ish ordinal suffix handling, using string_tree_num_tokens instead of string_tree_num_strings throughout to check for previously added words, using new is_likely_roman_numeral API --- src/expand.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/expand.c b/src/expand.c index 4670280c..569dd4c4 100644 --- a/src/expand.c +++ b/src/expand.c @@ -67,7 +67,7 @@ inline size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_to } ssize_t prev_char_len = utf8proc_iterate_reversed(ptr, start, &unichr); if (prev_char_len <= 0) return 0; - if (!utf8_is_digit(utf8proc_category(unichr)) && !is_roman_numeral_len(str 
+ token_offset, token_len)) { + if (!utf8_is_digit(utf8proc_category(unichr)) && !is_likely_roman_numeral_len(str + token_offset, token_len)) { return 0; } } else { @@ -932,7 +932,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } if (token.type != WHITESPACE) { - if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) ) { + if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) ) { log_debug("Adding space\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); @@ -942,7 +942,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); string_tree_finalize_token(tree); last_added_was_whitespace = false; - } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_strings(tree) > 0 ) { + } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 ) { log_debug("Adding pre-phrase whitespace\n"); last_added_was_whitespace = true; string_tree_add_string(tree, " "); @@ -1065,12 +1065,14 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) { // don't delete the "E" in "Avenue E" log_debug("final phrase is edge ignorable out of two phrases. 
Checking previous phrase is ignorable.\n"); - skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))); + + skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))) && string_tree_num_tokens(tree) > 0; } else { log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n"); // delete "St" in "E St" other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase)); skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components)); + //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components); } } @@ -1224,7 +1226,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal log_debug("expansion_valid_components == %d\n", expansion_valid_components); if (added_expansions == 0 && (!delete_phrases || !expansion_valid_components)) { - if (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) { + if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding space\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); @@ -1322,7 +1324,6 @@ string_tree_t *add_string_alternatives_phrase_option(char 
*str, libpostal_normal } - } else { log_debug("phrases NULL\n"); for (size_t j = 0; j < num_tokens; j++) { @@ -1335,7 +1336,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal } if (token.type != WHITESPACE) { - if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_strings(tree) > 0) { + if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding space V\n"); string_tree_add_string(tree, " "); string_tree_finalize_token(tree); @@ -1343,7 +1344,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options); last_added_was_whitespace = false; - } else if (!last_added_was_whitespace && string_tree_num_strings(tree) > 0) { + } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) { log_debug("Adding space VI\n"); string_tree_add_string(tree, " "); last_added_was_whitespace = true; @@ -1368,15 +1369,18 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) { size_t len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang); - cstring_array *strings = tree->strings; - // Add the original form first. When this function returns true, - // add_normalized_strings_token won't be called a second time. - add_normalized_strings_token(strings, str, token, options); + if (len_ordinal_suffix > 0) { + cstring_array *strings = tree->strings; + // Add the original form first. When this function returns true, + // add_normalized_strings_token won't be called a second time. 
+ add_normalized_strings_token(strings, str, token, options); + token_t normalized_token = token; + normalized_token.len = token.len - len_ordinal_suffix; + add_normalized_strings_token(strings, str, normalized_token, options); + return true; + } - token_t normalized_token = token; - normalized_token.len = token.len - len_ordinal_suffix; - add_normalized_strings_token(strings, str, normalized_token, options); - return true; + return false; } inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) { From 33bb90d94b44d95c778eed4983235553137485ba Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 27 Dec 2017 22:13:04 -0500 Subject: [PATCH 67/89] [utils] adding place.h header, which converts parser-like output into an object that can be used for comparisons. Currently single-value, but could use cstring_arrays for fields instead. --- src/place.c | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/place.h | 43 +++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 src/place.c create mode 100644 src/place.h diff --git a/src/place.c b/src/place.c new file mode 100644 index 00000000..6f8a03ae --- /dev/null +++ b/src/place.c @@ -0,0 +1,181 @@ +#include "place.h" +#include "address_parser.h" + +inline bool is_address_text_component(char *label) { + return (string_equals(label, ADDRESS_PARSER_LABEL_HOUSE) || + string_equals(label, ADDRESS_PARSER_LABEL_ROAD) || + string_equals(label, ADDRESS_PARSER_LABEL_METRO_STATION) || + string_equals(label, ADDRESS_PARSER_LABEL_SUBURB) || + string_equals(label, ADDRESS_PARSER_LABEL_CITY_DISTRICT) || + string_equals(label, ADDRESS_PARSER_LABEL_CITY) || + string_equals(label, ADDRESS_PARSER_LABEL_STATE_DISTRICT) || + string_equals(label, ADDRESS_PARSER_LABEL_ISLAND) || + string_equals(label, ADDRESS_PARSER_LABEL_STATE) || + string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY_REGION) || + string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY) || + 
string_equals(label, ADDRESS_PARSER_LABEL_WORLD_REGION) + ); +} + +language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values) { + if (num_components == 0 || values == NULL || labels == NULL) return NULL; + + language_classifier_response_t *lang_response = NULL; + + char *label; + char *value; + + size_t total_size = 0; + for (size_t i = 0; i < num_components; i++) { + value = values[i]; + label = labels[i]; + if (is_address_text_component(label)) { + total_size += strlen(value); + // extra char for spaces + if (i < num_components - 1) { + total_size++; + } + } + } + + char_array *combined = char_array_new_size(total_size); + if (combined == NULL) { + return NULL; + } + + for (size_t i = 0; i < num_components; i++) { + value = values[i]; + label = labels[i]; + if (is_address_text_component(label)) { + char_array_cat(combined, value); + if (i < num_components - 1) { + char_array_cat(combined, " "); + } + } + } + + char *combined_input = char_array_get_string(combined); + + lang_response = classify_languages(combined_input); + + char_array_destroy(combined); + return lang_response; +} + + + +place_t *place_new(void) { + place_t *place = calloc(1, sizeof(place_t)); + return place; +} + +void place_destroy(place_t *place) { + if (place == NULL) return; + free(place); +} + + +place_t *place_from_components(size_t num_components, char **labels, char **values) { + if (num_components == 0 || labels == NULL || values == NULL) { + return NULL; + } + + place_t *place = place_new(); + if (place == NULL) return NULL; + + for (size_t i = 0; i < num_components; i++) { + char *value = values[i]; + char *label = labels[i]; + if (string_equals(label, ADDRESS_PARSER_LABEL_ROAD)) { + if (place->street == NULL) { + place->street = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_HOUSE)) { + if (place->name == NULL) { + place->name = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_HOUSE_NUMBER)) { + if 
(place->house_number == NULL) { + place->house_number = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_POSTAL_CODE)) { + if (place->postal_code == NULL) { + place->postal_code = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_CITY)) { + if (place->city == NULL) { + place->city = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_STATE)) { + if (place->state == NULL) { + place->state = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY)) { + if (place->country == NULL) { + place->country = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_SUBURB)) { + if (place->suburb == NULL) { + place->suburb = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_CITY_DISTRICT)) { + if (place->city_district == NULL) { + place->city_district = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_STATE_DISTRICT)) { + if (place->state_district == NULL) { + place->state_district = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_COUNTRY_REGION)) { + if (place->country_region == NULL) { + place->country_region = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_ISLAND)) { + if (place->island == NULL) { + place->island = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_WORLD_REGION)) { + if (place->world_region == NULL) { + place->world_region = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_UNIT)) { + if (place->unit == NULL) { + place->unit = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_TELEPHONE)) { + if (place->telephone == NULL) { + place->telephone = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_WEBSITE)) { + if (place->website == NULL) { + place->website = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_LEVEL)) { + if (place->level == NULL) { + place->level = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_PO_BOX)) { + 
if (place->po_box == NULL) { + place->po_box = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_BUILDING)) { + if (place->building == NULL) { + place->building = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_STAIRCASE)) { + if (place->staircase == NULL) { + place->staircase = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_ENTRANCE)) { + if (place->entrance == NULL) { + place->entrance = value; + } + } else if (string_equals(label, ADDRESS_PARSER_LABEL_METRO_STATION)) { + if (place->metro_station == NULL) { + place->metro_station = value; + } + } + } + + return place; +} diff --git a/src/place.h b/src/place.h new file mode 100644 index 00000000..88920582 --- /dev/null +++ b/src/place.h @@ -0,0 +1,43 @@ +#ifndef PLACE_H +#define PLACE_H + +#include +#include + +#include "libpostal.h" +#include "language_classifier.h" + +typedef struct place { + char *name; + char *house_number; + char *street; + char *building; + char *entrance; + char *staircase; + char *level; + char *unit; + char *po_box; + char *metro_station; + char *suburb; + char *city_district; + char *city; + char *state_district; + char *island; + char *state; + char *country_region; + char *country; + char *world_region; + char *postal_code; + char *telephone; + char *website; +} place_t; + +language_classifier_response_t *place_languages(size_t num_components, char **labels, char **values); + +place_t *place_new(void); + +place_t *place_from_components(size_t num_components, char **labels, char **values); + +void place_destroy(place_t *place); + +#endif \ No newline at end of file From b90c3dab4bbf73ba53cd93eac818c2b955ee99bc Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 28 Dec 2017 04:34:25 -0500 Subject: [PATCH 68/89] [similarity/dedupe] adding Soft-TFIDF implementation with several different fallback qualifiers for the max-sim function (Damerau-Levenshtein and libpostal's new bucketed affine gap method for detecting abbreviations), but keeping 
Jaro-Winkler as the secondary similarity function in the final distance metric. Overall this should result in higher similarity values when one of the tokens may not quite match the pure secondary threshold in terms of Jaro-Winkler but may match on one of the other criteria. --- src/soft_tfidf.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++ src/soft_tfidf.h | 46 +++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 src/soft_tfidf.c create mode 100644 src/soft_tfidf.h diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c new file mode 100644 index 00000000..1bd43220 --- /dev/null +++ b/src/soft_tfidf.c @@ -0,0 +1,170 @@ +#include "soft_tfidf.h" +#include "float_utils.h" +#include "string_similarity.h" +#include "string_utils.h" + +static soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = { + .jaro_winkler_min = 0.9, + .damerau_levenshtein_max = 1, + .damerau_levenshtein_min_length = 4, + .use_abbreviations = true +}; + + +soft_tfidf_options_t soft_tfidf_default_options(void) { + return DEFAULT_SOFT_TFIDF_OPTIONS; +} + + +double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) { + if (token_scores1 == NULL || token_scores2 == NULL) return 0.0; + + if (num_tokens2 < num_tokens1) { + double *tmp_scores = token_scores1; + token_scores1 = token_scores2; + token_scores2 = tmp_scores; + char **tmp_tokens = tokens1; + tokens1 = tokens2; + tokens2 = tmp_tokens; + + size_t tmp_num_tokens = num_tokens1; + num_tokens1 = num_tokens2; + num_tokens2 = tmp_num_tokens; + } + + size_t len1 = num_tokens1; + size_t len2 = num_tokens2; + + double total_sim = 0.0; + + uint32_array **t1_tokens_unicode = NULL; + uint32_array **t2_tokens_unicode = NULL; + + uint32_array *t1_unicode; + uint32_array *t2_unicode; + + t1_tokens_unicode = calloc(len1, sizeof(uint32_array *)); + if (t1_tokens_unicode == NULL) { + total_sim = -1.0; + goto
return_soft_tfidf_score; + } + for (size_t i = 0; i < len1; i++) { + t1_unicode = unicode_codepoints(tokens1[i]); + if (t1_unicode == NULL) { + total_sim = -1.0; + goto return_soft_tfidf_score; + } + t1_tokens_unicode[i] = t1_unicode; + } + + t2_tokens_unicode = calloc(len2, sizeof(uint32_array *)); + if (t2_tokens_unicode == NULL) { + total_sim = -1.0; + goto return_soft_tfidf_score; + } + + for (size_t i = 0; i < len2; i++) { + t2_unicode = unicode_codepoints(tokens2[i]); + if (t2_unicode == NULL) { + total_sim = -1.0; + goto return_soft_tfidf_score; + } + t2_tokens_unicode[i] = t2_unicode; + } + + double jaro_winkler_min = options.jaro_winkler_min; + size_t damerau_levenshtein_max = options.damerau_levenshtein_max; + size_t damerau_levenshtein_min_length = options.damerau_levenshtein_min_length; + bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && len1 >= damerau_levenshtein_min_length; + + bool use_abbreviations = options.use_abbreviations; + + for (size_t i = 0; i < len1; i++) { + uint32_array *t1u = t1_tokens_unicode[i]; + uint32_array *t2u; + char *t1 = tokens1[i]; + double t1_score = token_scores1[i]; + + double max_sim = 0.0; + size_t min_dist = t1u->n; + size_t argmax_sim = 0; + size_t argmin_dist = 0; + double argmin_dist_sim = 0.0; + size_t last_abbreviation = 0; + double last_abbreviation_sim = 0.0; + bool have_abbreviation = false; + double t2_score; + + for (size_t j = 0; j < len2; j++) { + char *t2 = tokens2[j]; + t2u = t2_tokens_unicode[j]; + if (unicode_equals(t1u, t2u)) { + max_sim = 1.0; + argmax_sim = j; + break; + } + + double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u); + if (jaro_winkler > max_sim) { + max_sim = jaro_winkler; + argmax_sim = j; + } + + if (use_damerau_levenshtein) { + size_t replace_cost = 0; + ssize_t dist = damerau_levenshtein_distance_unicode(t1u, t2u, replace_cost); + if (dist >= 0 && dist < min_dist) { + min_dist = (size_t)dist; + argmin_dist = j; + argmin_dist_sim = jaro_winkler; + } + } + + if 
(use_abbreviations) { + bool is_abbreviation = possible_abbreviation_unicode(t1u, t2u); + if (is_abbreviation) { + last_abbreviation = j; + last_abbreviation_sim = jaro_winkler; + have_abbreviation = true; + } + } + } + + // Note: here edit distance, affine gap and abbreviations are only used in the thresholding process. + // Jaro-Winkler is still used to calculate similarity + + if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) { + t2_score = token_scores2[argmax_sim]; + total_sim += max_sim * t1_score * t2_score; + } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) { + t2_score = token_scores2[argmin_dist]; + total_sim += argmin_dist_sim * t1_score * t2_score; + } else if (use_abbreviations && have_abbreviation) { + t2_score = token_scores2[last_abbreviation]; + total_sim += last_abbreviation_sim * t1_score * t2_score; + } + } + +return_soft_tfidf_score: + if (t1_tokens_unicode != NULL) { + for (size_t i = 0; i < len1; i++) { + t1_unicode = t1_tokens_unicode[i]; + if (t1_unicode != NULL) { + uint32_array_destroy(t1_unicode); + } + } + free(t1_tokens_unicode); + } + + if (t2_tokens_unicode != NULL) { + for (size_t i = 0; i < len2; i++) { + t2_unicode = t2_tokens_unicode[i]; + if (t2_unicode != NULL) { + uint32_array_destroy(t2_unicode); + } + } + free(t2_tokens_unicode); + } + + return total_sim; +} \ No newline at end of file diff --git a/src/soft_tfidf.h b/src/soft_tfidf.h new file mode 100644 index 00000000..7d777fc5 --- /dev/null +++ b/src/soft_tfidf.h @@ -0,0 +1,46 @@ +#ifndef SOFT_TFIDF_H +#define SOFT_TFIDF_H + +#include +#include "collections.h" +#include "libpostal.h" + +/* +This is a variant of Soft-TFIDF as described in: + +Cohen, Ravikumar, and Fienberg. A comparison of string distance +metrics for name-matching tasks. 
(2003) +https://www.cs.cmu.edu/~wcohen/postscript/ijcai-ws-2003.pdf + +Soft TFIDF is a hybrid similarity function for strings, typically names, +which combines both global statistics (TF-IDF) and a local similarity +function (e.g. Jaro-Winkler, which the authors suggest performs best). + +Given two strings, s1 and s2, each token t1 in s1 is matched with its most +similar counterpart t2 in s2 according to the local distance function. + +The Soft-TFIDF similarity is then the dot product of the max token +similarities and the cosine similarity of the TF-IDF vectors for all tokens +if the max similarity is >= a given threshold theta. + +This version is a modified Soft-TFIDF. Jaro-Winkler is used as the secondary +distance metric. However, the defintion of two tokens being "similar" is +defined as either: + +1. Jaro-Winkler distance >= theta +2. Damerau-Levenshtein edit distance <= max_edit_distance +3. Affine gap edit counts indicate a possible abbreviation (# matches == min(len1, len2)) +*/ + +typedef struct soft_tfidf_options { + double jaro_winkler_min; + size_t damerau_levenshtein_max; + size_t damerau_levenshtein_min_length; + bool use_abbreviations; +} soft_tfidf_options_t; + +soft_tfidf_options_t soft_tfidf_default_options(void); + +double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options); + +#endif \ No newline at end of file From 8fd4242eb8be2649171df6668c7c0d5e3fb47c94 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 28 Dec 2017 23:54:10 -0500 Subject: [PATCH 69/89] [fix] bug in Jaro distance --- src/string_similarity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index 953dbadd..bcef0380 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -438,8 +438,8 @@ double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) { size_t max_len = 
len1 > len2 ? len1 : len2; size_t match_distance = (max_len / 2) - 1; - uint8_t *u1_matches = calloc(len2, sizeof(uint8_t)); - uint8_t *u2_matches = calloc(len1, sizeof(uint8_t)); + uint8_t *u1_matches = calloc(len1, sizeof(uint8_t)); + uint8_t *u2_matches = calloc(len2, sizeof(uint8_t)); uint32_t *u1 = u1_array->a; uint32_t *u2 = u2_array->a; From cabdbfccd2e977bb2749a96fc904e720a6aac137 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 28 Dec 2017 23:55:41 -0500 Subject: [PATCH 70/89] [fix] using same order in root expansions --- src/expand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/expand.c b/src/expand.c index 569dd4c4..1c887055 100644 --- a/src/expand.c +++ b/src/expand.c @@ -1128,7 +1128,7 @@ string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normal // Delete "Avenue" in "5th Avenue" } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) { log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n"); - current_phrase_ignorable = have_non_phrase_tokens || string_tree_num_tokens(tree) > 0 || (have_possible_root && !current_phrase_have_possible_root); + current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0; log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable); // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S" } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) { From 24a77ea03f192a0cc5e65ae93a6b203b256a8490 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 00:04:24 -0500 Subject: [PATCH 71/89] [fix] another valgrind error in counting transposes in our counting affine gap implementation (mixed indices) --- src/string_similarity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index bcef0380..61882325 100644 --- a/src/string_similarity.c +++ 
b/src/string_similarity.c @@ -184,7 +184,7 @@ affine_gap_edits_t affine_gap_distance_unicode_costs(uint32_array *u1_array, uin bool is_transpose = false; size_t w = c1 != c2 && !both_separators ? mismatch_cost : match_cost; - if (c1 != c2 && utf8_is_letter(utf8proc_category(c2)) && utf8_is_letter(utf8proc_category(c1)) && i < n && c2 == u1[i] && j < m && c1 == u2[j]) { + if (c1 != c2 && utf8_is_letter(utf8proc_category(c2)) && utf8_is_letter(utf8proc_category(c1)) && j < m && c2 == u1[j] && i < n && c1 == u2[i]) { w = transpose_cost; is_transpose = true; } From f1e68865366ed1287f3f1d573d0f05e88416d248 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 02:38:48 -0500 Subject: [PATCH 72/89] [similarity/dedupe] adding options for acronym alignments and address phrase matches in Soft-TFIDF. Acronym alignments will give higher similarity to NYU vs. "New York University" whereas phrase matches would match known phrases that share the same canonical like "Cty Rd" vs. "C.R." vs. "County Road" within the Soft-TFIDF similarity calculation. 
--- src/soft_tfidf.c | 236 ++++++++++++++++++++++++++++++++++++++++++++--- src/soft_tfidf.h | 2 + 2 files changed, 223 insertions(+), 15 deletions(-) diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c index 1bd43220..f5f1800f 100644 --- a/src/soft_tfidf.c +++ b/src/soft_tfidf.c @@ -1,7 +1,9 @@ #include "soft_tfidf.h" +#include "address_dictionary.h" #include "float_utils.h" #include "string_similarity.h" #include "string_utils.h" +#include "log/log.h" static soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = { .jaro_winkler_min = 0.9, @@ -15,11 +17,105 @@ soft_tfidf_options_t soft_tfidf_default_options(void) { return DEFAULT_SOFT_TFIDF_OPTIONS; } +bool compare_canonical(address_expansion_t e1, char **tokens1, phrase_t match1, address_expansion_t e2, char **tokens2, phrase_t match2) { + bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; + bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX; -double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) { + if (!e1_canonical && !e2_canonical) { + return e1.canonical_index == e2.canonical_index; + } else if (e1_canonical && e2_canonical) { + if (match1.len != match2.len || match1.len == 0) return false; + for (size_t i = 0; i < match1.len; i++) { + char *s1 = tokens1[match1.start + i]; + char *s2 = tokens2[match2.start + i]; + if (!string_equals(s1, s2)) return false; + } + return true; + } else { + char **canonical_tokens = e1_canonical ? tokens1 : tokens2; + char *other_canonical = e1_canonical ? address_dictionary_get_canonical(e2.canonical_index) : address_dictionary_get_canonical(e1.canonical_index); + phrase_t match = e1_canonical ? 
match1 : match2; + + size_t canonical_index = 0; + size_t canonical_len = strlen(other_canonical); + + for (size_t i = match.start; i < match.start + match.len; i++) { + char *canonical_token = canonical_tokens[i]; + size_t canonical_token_len = strlen(canonical_token); + + if (canonical_index + canonical_token_len <= canonical_len && strncmp(other_canonical + canonical_index, canonical_token, canonical_token_len) == 0) { + canonical_index += canonical_token_len; + + if (i < match.start + match.len - 1 && canonical_index < canonical_len && strncmp(other_canonical + canonical_index, " ", 1) == 0) { + canonical_index++; + } + } else { + return false; + } + } + return true; + } +} + +typedef enum { + CANONICAL_NO_MATCH = 0, + NEITHER_CANONICAL, + SECOND_CANONICAL, + FIRST_CANONICAL, + BOTH_CANONICAL +} canonical_match_t; + +bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) { + address_expansion_value_t *val1 = address_dictionary_get_expansions(match1.data); + address_expansion_value_t *val2 = address_dictionary_get_expansions(match2.data); + + if (val1 == NULL || val2 == NULL) return false; + + address_expansion_array *expansions_array1 = val1->expansions; + address_expansion_array *expansions_array2 = val2->expansions; + + if (expansions_array1 == NULL || expansions_array2 == NULL) return false; + + address_expansion_t *expansions1 = expansions_array1->a; + address_expansion_t *expansions2 = expansions_array2->a; + + *response = CANONICAL_NO_MATCH; + + bool same_canonical = false; + for (size_t i = 0; i < expansions_array1->n; i++) { + address_expansion_t e1 = expansions1[i]; + + for (size_t j = 0; j < expansions_array2->n; j++) { + address_expansion_t e2 = expansions2[j]; + + same_canonical = compare_canonical(e1, tokens1, match1, e2, tokens2, match2); + if (same_canonical) { + bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; + bool 
e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX; + + if (e1_canonical && e2_canonical) { + *response = BOTH_CANONICAL; + } else if (e1_canonical) { + *response = FIRST_CANONICAL; + } else if (e2_canonical) { + *response = SECOND_CANONICAL; + } else { + *response = NEITHER_CANONICAL; + } + break; + } + } + if (same_canonical) break; + } + + return same_canonical; +} + + +double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options) { if (token_scores1 == NULL || token_scores2 == NULL) return 0.0; - if (num_tokens2 < num_tokens1) { + if (num_tokens1 > num_tokens2) { double *tmp_scores = token_scores1; token_scores1 = token_scores2; token_scores2 = tmp_scores; @@ -27,6 +123,10 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s tokens1 = tokens2; tokens2 = tmp_tokens; + phrase_array *tmp_phrases = phrases1; + phrases1 = phrases2; + phrases2 = tmp_phrases; + size_t tmp_num_tokens = num_tokens1; num_tokens1 = num_tokens2; num_tokens2 = tmp_num_tokens; @@ -43,6 +143,14 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s uint32_array *t1_unicode; uint32_array *t2_unicode; + int64_array *phrase_memberships_array1 = NULL; + int64_array *phrase_memberships_array2 = NULL; + int64_t *phrase_memberships1 = NULL; + int64_t *phrase_memberships2 = NULL; + + int64_array *acronym_memberships_array = NULL; + int64_t *acronym_memberships = NULL; + t1_tokens_unicode = calloc(len1, sizeof(uint32_array *)); if (t1_tokens_unicode == NULL) { total_sim = -1.0; @@ -63,13 +171,37 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s goto return_soft_tfidf_score; } - for (size_t i = 0; i < len2; i++) { - t2_unicode = unicode_codepoints(tokens2[i]); + for (size_t j = 0; j < 
len2; j++) { + t2_unicode = unicode_codepoints(tokens2[j]); if (t2_unicode == NULL) { total_sim = -1.0; goto return_soft_tfidf_score; } - t2_tokens_unicode[i] = t2_unicode; + t2_tokens_unicode[j] = t2_unicode; + } + + + if (phrases1 != NULL && phrases2 != NULL) { + phrase_memberships_array1 = int64_array_new(); + phrase_memberships_array2 = int64_array_new(); + token_phrase_memberships(phrases1, phrase_memberships_array1, len1); + token_phrase_memberships(phrases2, phrase_memberships_array2, len2); + + if (phrase_memberships_array1->n == len1) { + phrase_memberships1 = phrase_memberships_array1->a; + } + + if (phrase_memberships_array2->n == len2) { + phrase_memberships2 = phrase_memberships_array2->a; + } + } + + if (acronym_alignments != NULL) { + acronym_memberships_array = int64_array_new(); + token_phrase_memberships(acronym_alignments, acronym_memberships_array, len2); + if (acronym_memberships_array->n == len2) { + acronym_memberships = acronym_memberships_array->a; + } } double jaro_winkler_min = options.jaro_winkler_min; @@ -92,24 +224,65 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s double argmin_dist_sim = 0.0; size_t last_abbreviation = 0; double last_abbreviation_sim = 0.0; - bool have_abbreviation = false; + bool have_abbreviation = false; + bool have_acronym_match = false; + phrase_t acronym_phrase = NULL_PHRASE; + bool have_phrase_match = false; + int64_t pm1 = phrase_memberships1 != NULL ? phrase_memberships1[i] : NULL_PHRASE_MEMBERSHIP; + phrase_t p1 = pm1 >= 0 ? phrases1->a[pm1] : NULL_PHRASE; + phrase_t argmax_phrase = NULL_PHRASE; + + canonical_match_t best_canonical_phrase_response = CANONICAL_NO_MATCH; + double t2_score; for (size_t j = 0; j < len2; j++) { char *t2 = tokens2[j]; t2u = t2_tokens_unicode[j]; + int64_t pm2 = phrase_memberships2 != NULL ? phrase_memberships2[j] : NULL_PHRASE_MEMBERSHIP; + phrase_t p2 = pm2 >= 0 ? 
phrases2->a[pm2] : NULL_PHRASE; + + canonical_match_t canonical_response = CANONICAL_NO_MATCH; + if (p1.len > 0 && p2.len > 0 && phrases_have_same_canonical(num_tokens1, tokens1, num_tokens2, tokens2, p1, p2, &canonical_response)) { + if (canonical_response > best_canonical_phrase_response) { + best_canonical_phrase_response = canonical_response; + argmax_sim = j; + argmax_phrase = p2; + max_sim = 1.0; + have_phrase_match = true; + continue; + } + } + if (unicode_equals(t1u, t2u)) { max_sim = 1.0; argmax_sim = j; break; } + if (acronym_memberships != NULL) { + int64_t acronym_membership = acronym_memberships[j]; + log_debug("acronym_membership = %zd\n", acronym_membership); + if (acronym_membership >= 0) { + acronym_phrase = acronym_alignments->a[acronym_membership]; + uint32_t acronym_match_index = acronym_phrase.data; + if (acronym_match_index == i) { + max_sim = 1.0; + argmax_sim = j; + have_acronym_match = true; + log_debug("have acronym match\n"); + break; + } + } + } + double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u); if (jaro_winkler > max_sim) { max_sim = jaro_winkler; argmax_sim = j; } + if (use_damerau_levenshtein) { size_t replace_cost = 0; ssize_t dist = damerau_levenshtein_distance_unicode(t1u, t2u, replace_cost); @@ -128,20 +301,36 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s have_abbreviation = true; } } + } // Note: here edit distance, affine gap and abbreviations are only used in the thresholding process. 
// Jaro-Winkler is still used to calculate similarity - if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) { - t2_score = token_scores2[argmax_sim]; - total_sim += max_sim * t1_score * t2_score; - } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) { - t2_score = token_scores2[argmin_dist]; - total_sim += argmin_dist_sim * t1_score * t2_score; - } else if (use_abbreviations && have_abbreviation) { - t2_score = token_scores2[last_abbreviation]; - total_sim += last_abbreviation_sim * t1_score * t2_score; + if (!have_acronym_match && !have_phrase_match) { + if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) { + log_debug("have max sim = %f\n", max_sim); + t2_score = token_scores2[argmax_sim]; + total_sim += max_sim * t1_score * t2_score; + } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) { + log_debug("levenshtein\n"); + t2_score = token_scores2[argmin_dist]; + total_sim += argmin_dist_sim * t1_score * t2_score; + } else if (use_abbreviations && have_abbreviation) { + log_debug("have abbreviation\n"); + t2_score = token_scores2[last_abbreviation]; + total_sim += last_abbreviation_sim * t1_score * t2_score; + } + } else if (have_phrase_match) { + for (size_t p = argmax_phrase.start; p < argmax_phrase.start + argmax_phrase.len; p++) { + t2_score = token_scores2[p]; + total_sim += max_sim * t1_score * t2_score; + } + } else { + for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) { + t2_score = token_scores2[p]; + total_sim += max_sim * t1_score * t2_score; + } } } @@ -166,5 +355,22 @@ return_soft_tfidf_score: free(t2_tokens_unicode); } + if (phrase_memberships_array1 != NULL) { + int64_array_destroy(phrase_memberships_array1); + } + + if (phrase_memberships_array2 != NULL) { + int64_array_destroy(phrase_memberships_array2); + } + + if (acronym_memberships_array != NULL) { + int64_array_destroy(acronym_memberships_array); + } + return 
total_sim; +} + + +double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) { + return soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, options); } \ No newline at end of file diff --git a/src/soft_tfidf.h b/src/soft_tfidf.h index 7d777fc5..244578ba 100644 --- a/src/soft_tfidf.h +++ b/src/soft_tfidf.h @@ -4,6 +4,7 @@ #include #include "collections.h" #include "libpostal.h" +#include "trie_search.h" /* This is a variant of Soft-TFIDF as described in: @@ -41,6 +42,7 @@ typedef struct soft_tfidf_options { soft_tfidf_options_t soft_tfidf_default_options(void); +double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options); double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options); #endif \ No newline at end of file From c5ad080fb0ed3c85f83d65e64965330375a28dd3 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 02:42:22 -0500 Subject: [PATCH 73/89] [similarity] moving stopword tokens array to a separate function in acronym token alignments --- src/acronyms.c | 54 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/src/acronyms.c b/src/acronyms.c index ed91b5a6..425b64f2 100644 --- a/src/acronyms.c +++ b/src/acronyms.c @@ -1,5 +1,33 @@ #include "acronyms.h" +static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_t num_languages, char **languages) { + size_t len = tokens->n; + uint32_array *stopwords_array = uint32_array_new_zeros(len); + 
+ uint32_t *stopwords = stopwords_array->a; + + for (size_t l = 0; l < num_languages; l++) { + char *lang = languages[l]; + phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)str, tokens, lang); + + if (lang_phrases != NULL) { + size_t num_lang_phrases = lang_phrases->n; + for (size_t p = 0; p < num_lang_phrases; p++) { + phrase_t phrase = lang_phrases->a[p]; + + if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) { + for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) { + stopwords[stop_idx] = 1; + } + } + } + phrase_array_destroy(lang_phrases); + } + } + + return stopwords_array; +} + phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) { if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) { return NULL; @@ -28,29 +56,13 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con token_t *t1 = tokens1->a; token_t *t2 = tokens2->a; - uint32_array *stopwords_array = uint32_array_new_zeros(len2); + uint32_array *stopwords_array = stopword_tokens(s2, tokens2, num_languages, languages); + if (stopwords_array == NULL) { + return NULL; + } uint32_t *stopwords = stopwords_array->a; - for (size_t l = 0; l < num_languages; l++) { - char *lang = languages[l]; - phrase_array *lang_phrases = search_address_dictionaries_tokens((char *)s2, tokens2, lang); - - if (lang_phrases != NULL) { - size_t num_lang_phrases = lang_phrases->n; - for (size_t p = 0; p < num_lang_phrases; p++) { - phrase_t phrase = lang_phrases->a[p]; - - if (address_phrase_in_dictionary(phrase, DICTIONARY_STOPWORD)) { - for (size_t stop_idx = phrase.start; stop_idx < phrase.start + phrase.len; stop_idx++) { - stopwords[stop_idx] = 1; - } - } - } - phrase_array_destroy(lang_phrases); - } - } - ssize_t acronym_start = -1; ssize_t acronym_token_pos = -1; @@ -136,5 +148,3 @@ phrase_array 
*acronym_token_alignments(const char *s1, token_array *tokens1, con return alignments; } - - From 1d1ce10fadcbd9a9a38d94e353925dfdf2781985 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 03:08:48 -0500 Subject: [PATCH 74/89] [similarity] adding a string array version of Jaccard similarity that creates the string sets internally for convenience --- src/jaccard.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- src/jaccard.h | 1 + 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/jaccard.c b/src/jaccard.c index 1f96c61f..87e27b8b 100644 --- a/src/jaccard.c +++ b/src/jaccard.c @@ -1,8 +1,7 @@ #include "jaccard.h" - double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) { - if (s1 == NULL || s2 == NULL) return -1.0; + if (s1 == NULL || s2 == NULL) return 0.0; size_t set_intersection = 0; size_t set_union = 0; @@ -24,4 +23,47 @@ double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2) { set_union += kh_size(s2); return (double)set_intersection / set_union; -} \ No newline at end of file +} + + +double jaccard_similarity_string_arrays(size_t num_strings1, char **strings1, size_t num_strings2, char **strings2) { + if (strings1 == NULL || strings2 == NULL || num_strings1 == 0 || num_strings2 == 0) return 0.0; + + khash_t(str_set) *string_set1 = kh_init(str_set); + if (string_set1 == NULL) return 0.0; + + kh_resize(str_set, string_set1, num_strings1); + int ret = 0; + + khiter_t k; + + for (size_t i = 0; i < num_strings1; i++) { + char *str1 = strings1[i]; + k = kh_put(str_set, string_set1, str1, &ret); + if (ret < 0) { + kh_destroy(str_set, string_set1); + return 0.0; + } + } + + khash_t(str_set) *string_set2 = kh_init(str_set); + if (string_set2 == NULL) { + kh_destroy(str_set, string_set1); + return 0.0; + } + kh_resize(str_set, string_set2, num_strings2); + for (size_t i = 0; i < num_strings2; i++) { + char *str2 = strings2[i]; + k = kh_put(str_set, string_set2, str2, &ret); + if (ret < 0) { + 
kh_destroy(str_set, string_set1); + kh_destroy(str_set, string_set2); + return 0.0; + } + } + + double sim = jaccard_similarity(string_set1, string_set2); + kh_destroy(str_set, string_set1); + kh_destroy(str_set, string_set2); + return sim; +} diff --git a/src/jaccard.h b/src/jaccard.h index a6468078..9f93266d 100644 --- a/src/jaccard.h +++ b/src/jaccard.h @@ -7,5 +7,6 @@ #include "collections.h" double jaccard_similarity(khash_t(str_set) *s1, khash_t(str_set) *s2); +double jaccard_similarity_string_arrays(size_t num_strings1, char **strings1, size_t num_strings2, char **strings2); #endif \ No newline at end of file From 1f1412c1205844268a2352880fd04f7d8949155e Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 03:32:41 -0500 Subject: [PATCH 75/89] [api] adding libpostal_place_languages method to public API for classifying languages consistently from components (may need to make several calls using the same languages and don't necessarily want the language classifier to be run on house numbers when we already know the languages from e.g. 
the street name - this provides a simple window into the language classifier focused on the entire address/record --- src/libpostal.c | 14 ++++++++++++++ src/libpostal.h | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/src/libpostal.c b/src/libpostal.c index 2825bab4..03a7dd9d 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -13,6 +13,7 @@ #include "language_classifier.h" #include "near_dupe.h" #include "normalize.h" +#include "place.h" #include "scanner.h" #include "string_utils.h" #include "token_types.h" @@ -94,6 +95,19 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels return cstring_array_to_strings(strings); } + +char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) { + language_classifier_response_t *lang_response = place_languages(num_components, labels, values); + + char **languages = lang_response->languages; + lang_response->languages = NULL; + *num_languages = lang_response->num_languages; + lang_response->num_languages = 0; + + language_classifier_response_destroy(lang_response); + return languages; +} + void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) { if (self == NULL) return; diff --git a/src/libpostal.h b/src/libpostal.h index dc1c3c4e..7c3b7e76 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -172,6 +172,9 @@ LIBPOSTAL_EXPORT bool libpostal_parser_print_features(bool print_features); Deduping */ + +// Near-dupe hashing methods + typedef struct libpostal_near_dupe_hash_options { bool with_name; bool with_address; @@ -193,6 +196,10 @@ LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_def LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, 
libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes); +// Dupe language classification + +LIBPOSTAL_EXPORT char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages); + // Setup/teardown methods LIBPOSTAL_EXPORT bool libpostal_setup(void); From 098babfdee01b2e738a290910b79a7fa2acdf818 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 04:48:00 -0500 Subject: [PATCH 76/89] [dedupe] adding the core pairwise deduping module which ties together most of the work on this branch. Includes simple phrase-aware exact deduping methods, with per-component variations as to whether e.g. a root expansion match counts as an exact duplicate or not (in a secondary unit, "No. 2" and "Apt 2" can be considered an exact match in English whereas we wouldn't want to make that kind of assumption for street e.g. "Park Ave" and "Park Pl"). The API is fairly low-level at present, and may require a few calls. Notably, we leave the TFIDF scores or other weighting schemes to the client. Since each component gets its own dupe classification, it leaves the door open for doing more specific checks around e.g. compound house numbers/ranges in the future. 
--- src/dedupe.c | 391 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/dedupe.h | 23 +++ 2 files changed, 414 insertions(+) create mode 100644 src/dedupe.c create mode 100644 src/dedupe.h diff --git a/src/dedupe.c b/src/dedupe.c new file mode 100644 index 00000000..bbf613a0 --- /dev/null +++ b/src/dedupe.c @@ -0,0 +1,391 @@ +#include "acronyms.h" +#include "address_parser.h" +#include "dedupe.h" +#include "expand.h" +#include "float_utils.h" +#include "jaccard.h" +#include "place.h" +#include "scanner.h" +#include "soft_tfidf.h" +#include "token_types.h" + +bool expansions_intersect(cstring_array *expansions1, cstring_array *expansions2) { + size_t n1 = cstring_array_num_strings(expansions1); + size_t n2 = cstring_array_num_strings(expansions2); + + bool intersect = false; + + for (size_t i = 0; i < n1; i++) { + char *e1 = cstring_array_get_string(expansions1, i); + for (size_t j = 0; j < n2; j++) { + char *e2 = cstring_array_get_string(expansions2, j); + if (string_equals(e1, e2)) { + intersect = true; + break; + } + } + if (intersect) break; + } + return intersect; +} + + +bool address_component_equals_root_option(char *s1, char *s2, libpostal_normalize_options_t options, bool root) { + uint64_t normalize_string_options = get_normalize_string_options(options); + + size_t n1, n2; + cstring_array *expansions1 = NULL; + cstring_array *expansions2 = NULL; + if (!root) { + expansions1 = expand_address(s1, options, &n1); + } else { + expansions1 = expand_address_root(s1, options, &n1); + } + + if (expansions1 == NULL) return false; + + if (!root) { + expansions2 = expand_address(s2, options, &n2); + } else { + expansions2 = expand_address_root(s2, options, &n2); + } + + if (expansions2 == NULL) { + cstring_array_destroy(expansions1); + return false; + } + + bool intersect = expansions_intersect(expansions1, expansions2); + + cstring_array_destroy(expansions1); + cstring_array_destroy(expansions2); + + return intersect; +} + +static inline bool 
address_component_equals(char *s1, char *s2, libpostal_normalize_options_t options) { + return address_component_equals_root_option(s1, s2, options, false); +} + +static inline bool address_component_equals_root(char *s1, char *s2, libpostal_normalize_options_t options) { + return address_component_equals_root_option(s1, s2, options, true); +} + + +static inline bool address_component_equals_root_fallback(char *s1, char *s2, libpostal_normalize_options_t options, bool root) { + return address_component_equals_root(s1, s2, options) || address_component_equals(s1, s2, options); +} + +libpostal_duplicate_status_t is_duplicate(char *value1, char *value2, libpostal_normalize_options_t normalize_options, libpostal_duplicate_options_t options, bool root_comparison_first, libpostal_duplicate_status_t root_comparison_status) { + if (value1 == NULL || value2 == NULL) { + return LIBPOSTAL_NULL_DUPLICATE_STATUS; + } + + normalize_options.num_languages = options.num_languages; + normalize_options.languages = options.languages; + + normalize_options.address_components |= LIBPOSTAL_ADDRESS_ANY; + + if (root_comparison_first) { + if (address_component_equals_root(value1, value2, normalize_options)) { + return root_comparison_status; + } else if (address_component_equals(value1, value2, normalize_options)) { + return LIBPOSTAL_EXACT_DUPLICATE; + } + } else { + if (address_component_equals(value1, value2, normalize_options)) { + return LIBPOSTAL_EXACT_DUPLICATE; + } else if (address_component_equals_root(value1, value2, normalize_options)) { + return root_comparison_status; + } + } + return LIBPOSTAL_NON_DUPLICATE; +} + +libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME; + bool root_comparison_first = false; + libpostal_duplicate_status_t root_comparison_status = 
LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} +libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET; + bool root_comparison_first = false; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL; + bool 
root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE; + bool root_comparison_first = true; + libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; + return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); +} + +libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM; + + place_t *place1 = place_from_components(num_components1, labels1, values1); + place_t *place2 = place_from_components(num_components2, labels2, values2); + + bool city_match = false; + libpostal_duplicate_status_t dupe_status = LIBPOSTAL_NON_DUPLICATE; + + if (place1->city != NULL && place2->city != NULL) { + city_match = address_component_equals(place1->city, place2->city, 
normalize_options); + dupe_status = LIBPOSTAL_EXACT_DUPLICATE; + } + + if (!city_match && place1->city == NULL && place1->city_district != NULL && place2->city != NULL) { + city_match = address_component_equals(place1->city_district, place2->city, normalize_options); + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } + + if (!city_match && place1->city == NULL && place1->suburb != NULL && place2->city != NULL) { + city_match = address_component_equals(place1->suburb, place2->city, normalize_options); + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } + + if (!city_match && place2->city == NULL && place2->city_district != NULL && place1->city != NULL) { + city_match = address_component_equals(place1->city, place2->city_district, normalize_options); + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } + + if (!city_match && place2->city == NULL && place2->suburb != NULL && place1->city != NULL) { + city_match = address_component_equals(place1->suburb, place2->suburb, normalize_options); + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } + + if (!city_match) { + goto exit_destroy_places; + } + + if (city_match && place1->state_district != NULL && place2->state_district != NULL && !address_component_equals_root(place1->state_district, place2->state_district, normalize_options)) { + dupe_status = LIBPOSTAL_NON_DUPLICATE; + goto exit_destroy_places; + } + + if (city_match && place1->state != NULL && place2->state != NULL && !address_component_equals(place1->state, place2->state, normalize_options)) { + dupe_status = LIBPOSTAL_NON_DUPLICATE; + goto exit_destroy_places; + } + + if (city_match && place1->country != NULL && place2->country != NULL && !address_component_equals(place1->country, place2->country, normalize_options)) { + dupe_status = LIBPOSTAL_NON_DUPLICATE; + goto exit_destroy_places; + } + +exit_destroy_places: + place_destroy(place1); + place_destroy(place2); + return dupe_status; + +} + +char *joined_string_and_tokens_from_strings(char 
**strings, size_t num_strings, token_array *tokens) { + if (tokens == NULL || strings == NULL || num_strings == 0) return NULL; + token_array_clear(tokens); + + size_t full_len = 0; + for (size_t i = 0; i < num_strings; i++) { + full_len += strlen(strings[i]); + if (i < num_strings - 1) full_len++; + } + + char_array *a = char_array_new_size(full_len); + for (size_t i = 0; i < num_strings; i++) { + char *str = strings[i]; + size_t len = strlen(str); + size_t offset = a->n; + char_array_append(a, str); + + scanner_t scanner = scanner_from_string(str, len); + uint16_t token_type = scan_token(&scanner); + + token_t token = (token_t){offset, len, token_type}; + token_array_push(tokens, token); + if (i < num_strings - 1 && !is_ideographic(token.type)) { + char_array_append(a, " "); + } + } + + char_array_terminate(a); + return char_array_to_string(a); +} + +bool have_ideographic_word_tokens(token_array *token_array) { + if (token_array == NULL) return false; + + size_t n = token_array->n; + token_t *tokens = token_array->a; + for (size_t i = 0; i < n; i++) { + token_t token = tokens[i]; + if (is_ideographic(token.type) && is_word_token(token.type)) { + return true; + } + } + return false; +} + +libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) { + normalize_options.num_languages = options.num_languages; + normalize_options.languages = options.languages; + + normalize_options.address_components |= LIBPOSTAL_ADDRESS_ANY; + + double max_sim = 0.0; + + // Default is non-duplicate; + libpostal_duplicate_status_t dupe_status = LIBPOSTAL_NON_DUPLICATE; + + token_array *token_array1 = token_array_new_size(num_tokens1); + char *joined1 = joined_string_and_tokens_from_strings(tokens1, num_tokens1, 
token_array1); + + token_array *token_array2 = token_array_new_size(num_tokens2); + char *joined2 = joined_string_and_tokens_from_strings(tokens2, num_tokens2, token_array2); + + size_t num_languages = options.num_languages; + char **languages = options.languages; + + phrase_array *acronym_alignments = NULL; + + phrase_array *phrases1 = NULL; + phrase_array *phrases2 = NULL; + + bool is_ideographic = have_ideographic_word_tokens(token_array1) && have_ideographic_word_tokens(token_array2); + + if (!is_ideographic) { + if (do_acronyms) { + acronym_alignments = acronym_token_alignments(joined1, token_array1, joined2, token_array2, num_languages, languages); + } + + if (num_languages > 0) { + phrases1 = phrase_array_new(); + phrases2 = phrase_array_new(); + + for (size_t i = 0; i < num_languages; i++) { + char *lang = languages[i]; + phrase_array_clear(phrases1); + phrase_array_clear(phrases2); + + search_address_dictionaries_tokens_with_phrases(joined1, token_array1, lang, &phrases1); + search_address_dictionaries_tokens_with_phrases(joined2, token_array2, lang, &phrases2); + + double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options); + if (sim > max_sim) { + max_sim = sim; + } + } + } else if (do_acronyms) { + max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options); + } else { + max_sim = soft_tfidf_similarity(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options); + } + } else { + max_sim = jaccard_similarity_string_arrays(num_tokens1, tokens1, num_tokens2, tokens2); + if (string_equals(joined1, joined2)) { + dupe_status = LIBPOSTAL_EXACT_DUPLICATE; + } else if (address_component_equals_root(joined1, joined2, normalize_options)) { + dupe_status = 
LIBPOSTAL_LIKELY_DUPLICATE; + } + } + + if (dupe_status == LIBPOSTAL_NON_DUPLICATE) { + if (max_sim > options.likely_dupe_threshold || double_equals(max_sim, options.likely_dupe_threshold)) { + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } else if (max_sim > options.needs_review_threshold || double_equals(max_sim, options.needs_review_threshold)) { + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } + } + + if (phrases1 != NULL) { + phrase_array_destroy(phrases1); + } + + if (phrases2 != NULL) { + phrase_array_destroy(phrases2); + } + + if (acronym_alignments != NULL) { + phrase_array_destroy(acronym_alignments); + } + + if (token_array1 != NULL) { + token_array_destroy(token_array1); + } + + if (joined1 != NULL) { + free(joined1); + } + + if (token_array2 != NULL) { + token_array_destroy(token_array2); + } + + if (joined2 != NULL) { + free(joined2); + } + + return (libpostal_duplicate_status_similarity_t){dupe_status, max_sim}; +} + +inline libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME; + + bool do_acronyms = true; + + soft_tfidf_options_t soft_tfidf_options = soft_tfidf_default_options(); + + return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms); +} + + +inline libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { + libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); + normalize_options.address_components = 
LIBPOSTAL_ADDRESS_STREET; + + // General purpose acronyms didn't make as much sense in the street name context + // things like County Road = CR should be handled by the address dictionaries + bool do_acronyms = false; + + soft_tfidf_options_t soft_tfidf_options = soft_tfidf_default_options(); + + return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms); +} diff --git a/src/dedupe.h b/src/dedupe.h new file mode 100644 index 00000000..c9a4fdf8 --- /dev/null +++ b/src/dedupe.h @@ -0,0 +1,23 @@ +#ifndef DEDUPE_H +#define DEDUPE_H + +#include +#include + +#include "libpostal.h" +#include "string_utils.h" + +libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); + +libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); 
+libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); + + +#endif \ No newline at end of file From cadf52d19fb9d53edc53b256383e71609574f042 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 04:50:08 -0500 Subject: [PATCH 77/89] [fix] making a few internal functions static --- src/expand.c | 24 ++++++++++++------------ src/near_dupe.c | 2 +- src/place.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/expand.c b/src/expand.c index 1c887055..b0d62e3c 100644 --- a/src/expand.c +++ b/src/expand.c @@ -45,7 +45,7 @@ inline uint64_t get_normalize_string_options(libpostal_normalize_options_t optio } -inline size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { +static inline size_t valid_ordinal_suffix_len(char *str, token_t token, token_t prev_token, char *lang) { size_t len_ordinal_suffix = ordinal_suffix_len(str + token.offset, token.len, lang); int32_t unichr = 0; @@ -495,7 +495,7 @@ bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, } -inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { +static inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_ACADEMIC_DEGREE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; @@ -553,7 +553,7 @@ inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) { } -inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) { +static inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_DIRECTIONAL: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | 
LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE; @@ -568,7 +568,7 @@ inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) { } } -inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) { +static inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) { switch (dictionary_id) { // Pre/post directionals can be removed if there are non-phrase tokens case DICTIONARY_DIRECTIONAL: @@ -582,7 +582,7 @@ inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) { } } -inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) { +static inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_LEVEL_STANDALONE: return LIBPOSTAL_ADDRESS_LEVEL; @@ -600,7 +600,7 @@ inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) { } -inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) { +static inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) { switch (dictionary_id) { case DICTIONARY_ACADEMIC_DEGREE: return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET; @@ -632,7 +632,7 @@ typedef enum { } gazetteer_match_type_t; -inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) { +static inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) { for (uint32_t j = 0; j < expansion.num_dictionaries; j++) { uint16_t dictionary_id = expansion.dictionary_ids[j]; uint32_t components = 0; @@ -662,23 +662,23 @@ inline bool address_expansion_matches_type_for_components(address_expansion_t ex return false; } -inline bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { +bool address_expansion_is_ignorable_for_components(address_expansion_t 
expansion, uint32_t address_components) { return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_IGNORABLE); } -inline bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { +bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) { return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE); } -inline bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) { +bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) { return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT); } -inline bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) { +bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) { return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER); } -inline bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) { +bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) { return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_VALID_COMPONENTS); } diff --git a/src/near_dupe.c b/src/near_dupe.c index 547d0aba..b649d9d8 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -169,7 +169,7 @@ cstring_array *expanded_component_combined(char *input, libpostal_normalize_opti } } -inline cstring_array *expanded_component_root_with_fallback(char *input, libpostal_normalize_options_t options, size_t *n) { +static inline cstring_array 
*expanded_component_root_with_fallback(char *input, libpostal_normalize_options_t options, size_t *n) { cstring_array *root_expansions = expand_address_root(input, options, n); if (*n > 0) { return root_expansions; diff --git a/src/place.c b/src/place.c index 6f8a03ae..549f1f48 100644 --- a/src/place.c +++ b/src/place.c @@ -1,7 +1,7 @@ #include "place.h" #include "address_parser.h" -inline bool is_address_text_component(char *label) { +static inline bool is_address_text_component(char *label) { return (string_equals(label, ADDRESS_PARSER_LABEL_HOUSE) || string_equals(label, ADDRESS_PARSER_LABEL_ROAD) || string_equals(label, ADDRESS_PARSER_LABEL_METRO_STATION) || From 8495cda1eb8b2534454dce3741220f39bb5f4844 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 13:48:54 -0500 Subject: [PATCH 78/89] [api] adding pairwise-dupe functions/structs to the public header --- src/libpostal.c | 42 ++++++++++++++++++++++++++++++++++++++++++ src/libpostal.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/src/libpostal.c b/src/libpostal.c index 03a7dd9d..15f20948 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -8,6 +8,7 @@ #include "address_dictionary.h" #include "address_parser.h" +#include "dedupe.h" #include "expand.h" #include "language_classifier.h" @@ -108,6 +109,47 @@ char **libpostal_place_languages(size_t num_components, char **labels, char **va return languages; } +libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_name_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_street_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_house_number_duplicate(value1, value2, 
options); +} + +libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_po_box_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_unit_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_floor_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { + return is_postal_code_duplicate(value1, value2, options); +} + +libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { + return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options); +} + +libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { + return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); +} + +libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { + return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); +} + + void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) { if (self == NULL) return; diff --git a/src/libpostal.h 
b/src/libpostal.h index 7c3b7e76..25de29c8 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -200,6 +200,47 @@ LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_componen LIBPOSTAL_EXPORT char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages); +// Pairwise dupe methods + +typedef enum { + LIBPOSTAL_NULL_DUPLICATE_STATUS = -1, + LIBPOSTAL_NON_DUPLICATE = 0, + LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW = 3, + LIBPOSTAL_LIKELY_DUPLICATE = 6, + LIBPOSTAL_EXACT_DUPLICATE = 9, +} libpostal_duplicate_status_t; + +typedef struct libpostal_duplicate_options { + size_t num_languages; + char **languages; +} libpostal_duplicate_options_t; + +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); + +// Pairwise fuzzy dupe 
methods, return status & similarity + +typedef struct libpostal_duplicate_similarity_options { + size_t num_languages; + char **languages; + double needs_review_threshold; + double likely_dupe_threshold; +} libpostal_duplicate_similarity_options_t; + +typedef struct libpostal_duplicate_status_similarity { + libpostal_duplicate_status_t status; + double similarity; +} libpostal_duplicate_status_similarity_t; + +LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); +LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); + // Setup/teardown methods LIBPOSTAL_EXPORT bool libpostal_setup(void); From 53543be5a5e477e33a5e885db6ebf53c56e2188b Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 17:46:35 -0500 Subject: [PATCH 79/89] [build] adding new source files to Makefile for the lieu APIs --- src/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile.am b/src/Makefile.am index 07af51d3..9b5f4887 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -12,7 +12,7 @@ DEFAULT_INCLUDES = -I.. 
-I/usr/local/include CFLAGS = lib_LTLIBRARIES = libpostal.la -libpostal_la_SOURCES = strndup.c libpostal.c expand.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c place.c near_dupe.c double_metaphone.c geohash/geohash.c +libpostal_la_SOURCES = strndup.c libpostal.c expand.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c utf8proc/utf8proc.c normalize.c numex.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c place.c near_dupe.c double_metaphone.c geohash/geohash.c dedupe.c string_similarity.c acronyms.c soft_tfidf.c jaccard.c libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS) libpostal_la_CFLAGS = $(CFLAGS_O2) -D LIBPOSTAL_EXPORTS libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@ -no-undefined From 6dff154a99ae0c03e5a032697d21c079c3eb8cd2 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 29 Dec 2017 17:48:05 -0500 Subject: [PATCH 80/89] [api] adding APIs for getting default options and using a consistent naming convention --- src/dedupe.c | 8 ++++---- src/dedupe.h | 4 ++-- src/libpostal.c | 45 +++++++++++++++++++++++++++++++++++++++++--- src/libpostal.h | 19 +++++++++++++------ src/near_dupe_test.c | 2 +- 5 files changed, 62 insertions(+), 16 deletions(-) diff --git a/src/dedupe.c b/src/dedupe.c index bbf613a0..94453f26 100644 --- a/src/dedupe.c +++ b/src/dedupe.c @@ -263,7 +263,7 @@ bool 
have_ideographic_word_tokens(token_array *token_array) { return false; } -libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) { +libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) { normalize_options.num_languages = options.num_languages; normalize_options.languages = options.languages; @@ -362,10 +362,10 @@ libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, c free(joined2); } - return (libpostal_duplicate_status_similarity_t){dupe_status, max_sim}; + return (libpostal_fuzzy_duplicate_status_t){dupe_status, max_sim}; } -inline libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +inline libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME; @@ -377,7 +377,7 @@ inline libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t nu } -inline libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char 
**tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET; diff --git a/src/dedupe.h b/src/dedupe.h index c9a4fdf8..5c40fb8c 100644 --- a/src/dedupe.h +++ b/src/dedupe.h @@ -16,8 +16,8 @@ libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libp libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); -libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); +libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); #endif \ No newline at end of file diff --git a/src/libpostal.c 
b/src/libpostal.c index 15f20948..9ac2dcab 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -79,7 +79,7 @@ static libpostal_near_dupe_hash_options_t LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIO .address_only_keys = false }; -libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void) { +libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void) { return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS; } @@ -109,6 +109,22 @@ char **libpostal_place_languages(size_t num_components, char **labels, char **va return languages; } +static libpostal_duplicate_options_t LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS = { + .num_languages = 0, + .languages = NULL +}; + +libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void) { + return LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS; +} + +libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages) { + libpostal_duplicate_options_t options = LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS; + options.num_languages = num_languages; + options.languages = languages; + return options; +} + libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { return is_name_duplicate(value1, value2, options); } @@ -141,11 +157,34 @@ libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_component return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options); } -libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +#define DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD 0.7 +#define DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD 0.9 + +static libpostal_fuzzy_duplicate_options_t DEFAULT_FUZZY_DUPLICATE_OPTIONS = { + .num_languages = 0, + .languages = 
NULL, + .needs_review_threshold = DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD, + .likely_dupe_threshold = DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD +}; + + +libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void) { + return DEFAULT_FUZZY_DUPLICATE_OPTIONS; +} + +libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages) { + libpostal_fuzzy_duplicate_options_t options = DEFAULT_FUZZY_DUPLICATE_OPTIONS; + options.num_languages = num_languages; + options.languages = languages; + return options; +} + + +libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); } -libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); } diff --git a/src/libpostal.h b/src/libpostal.h index 25de29c8..76aa4ab5 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -192,7 +192,7 @@ typedef struct libpostal_near_dupe_hash_options { } libpostal_near_dupe_hash_options_t; -LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void); +LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t 
libpostal_get_near_dupe_hash_default_options(void); LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes); @@ -215,6 +215,10 @@ typedef struct libpostal_duplicate_options { char **languages; } libpostal_duplicate_options_t; + +LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void); +LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages); + LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); @@ -226,20 +230,23 @@ LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(siz // Pairwise fuzzy dupe methods, return status & similarity -typedef struct libpostal_duplicate_similarity_options { +typedef struct libpostal_fuzzy_duplicate_options { size_t num_languages; char **languages; double needs_review_threshold; double likely_dupe_threshold; -} libpostal_duplicate_similarity_options_t; +} libpostal_fuzzy_duplicate_options_t; typedef struct libpostal_duplicate_status_similarity { libpostal_duplicate_status_t status; double similarity; -} libpostal_duplicate_status_similarity_t; +} libpostal_fuzzy_duplicate_status_t; -LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char 
**tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages); + +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); // Setup/teardown methods diff --git a/src/near_dupe_test.c b/src/near_dupe_test.c index 18155dea..db7d67bf 100644 --- a/src/near_dupe_test.c +++ b/src/near_dupe_test.c @@ -14,7 +14,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - libpostal_near_dupe_hash_options_t options = libpostal_near_dupe_hash_default_options(); + libpostal_near_dupe_hash_options_t options = libpostal_get_near_dupe_hash_default_options(); cstring_array *labels_array = cstring_array_new(); cstring_array *values_array = cstring_array_new(); From c48c2b778c0b6fccabdf42989a324b4af5819126 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 02:28:38 -0500 Subject: [PATCH 81/89] [dedupe] fixes to near dupe hashing, geohash lengths, cutting off name hashing at 50 unique tokens, fixing memory leaks, checking for valid geo 
components and returning NULL if one of the required fields isn't present --- src/near_dupe.c | 52 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/src/near_dupe.c b/src/near_dupe.c index b649d9d8..f2c08280 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -185,23 +185,24 @@ static cstring_array *geohash_and_neighbors(double latitude, double longitude, s if (geohash_precision == 0) return NULL; if (geohash_precision > MAX_GEOHASH_PRECISION) geohash_precision = MAX_GEOHASH_PRECISION; + size_t geohash_len = geohash_precision + 1; - char geohash[geohash_precision + 1]; - if (geohash_encode(latitude, longitude, geohash, geohash_precision) != GEOHASH_OK) { + char geohash[geohash_len]; + if (geohash_encode(latitude, longitude, geohash, geohash_len) != GEOHASH_OK) { return NULL; } - size_t neighbors_size = geohash_precision * 8 + 1; + size_t neighbors_size = geohash_len * 8; char neighbors[neighbors_size]; int num_strings = 0; if (geohash_neighbors(geohash, neighbors, neighbors_size, &num_strings) == GEOHASH_OK && num_strings == 8) { - cstring_array *strings = cstring_array_new_size(9 * geohash_precision + 1); + cstring_array *strings = cstring_array_new_size(9 * geohash_len); cstring_array_add_string(strings, geohash); for (int i = 0; i < num_strings; i++) { - char *neighbor = neighbors + geohash_precision * i; + char *neighbor = neighbors + geohash_len * i; cstring_array_add_string(strings, neighbor); } return strings; @@ -210,6 +211,8 @@ static cstring_array *geohash_and_neighbors(double latitude, double longitude, s return NULL; } +#define MAX_NAME_TOKENS 50 + cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options) { normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; @@ -276,7 +279,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal k = kh_get(str_set, unique_strings, dm_primary); - if (k == 
kh_end(unique_strings)) { + if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { log_debug("adding dm_primary = %s\n", dm_primary); cstring_array_add_string(strings, dm_primary); k = kh_put(str_set, unique_strings, strdup(dm_primary), &ret); @@ -289,7 +292,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal k = kh_get(str_set, unique_strings, dm_secondary); - if (k == kh_end(unique_strings)) { + if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { log_debug("adding dm_secondary = %s\n", dm_secondary); cstring_array_add_string(strings, dm_secondary); k = kh_put(str_set, unique_strings, strdup(dm_secondary), &ret); @@ -327,6 +330,8 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal token_array_destroy(token_array); char_array_destroy(combined_words_no_whitespace); + cstring_array_destroy(name_expansions); + const char *key; kh_foreach_key(unique_strings, key, { @@ -394,11 +399,32 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes, cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) { + if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_postal_code) return NULL; + place_t *place = place_from_components(num_components, labels, values); log_debug("created place\n"); if (place == NULL) return NULL; - size_t n = 0; + bool have_valid_geo = options.with_latlon; + + if (!have_valid_geo && options.with_postal_code && place->postal_code != NULL) { + have_valid_geo = true; + } + + if (!have_valid_geo && options.with_city_or_equivalent && (place->city != NULL || place->city_district != NULL || place->suburb != NULL || place->island != NULL)) { + have_valid_geo = true; + } + + if (!have_valid_geo && options.with_small_containing_boundaries && (place->state_district != NULL)) { 
+ have_valid_geo = true; + } + + + if (!have_valid_geo) { + log_debug("no valid geo\n"); + place_destroy(place); + return NULL; + } libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); @@ -413,7 +439,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, normalize_options.languages = lang_response->languages; } } else { - normalize_options.num_languages = languages; + normalize_options.num_languages = num_languages; normalize_options.languages = languages; } @@ -531,7 +557,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, } } - if (place->state_district != NULL) { + if (place->state_district != NULL && options.with_small_containing_boundaries) { size_t num_state_district_expansions = 0; cstring_array *state_district_expansions = expand_address_root(place->state_district, normalize_options, &num_state_district_expansions); if (containing_expansions == NULL) { @@ -560,8 +586,6 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, return NULL; } - bool added = false; - num_name_expansions = name_expansions != NULL ? cstring_array_num_strings(name_expansions) : 0; num_street_expansions = street_expansions != NULL ? cstring_array_num_strings(street_expansions) : 0; num_house_number_expansions = house_number_expansions != NULL ? 
cstring_array_num_strings(house_number_expansions) : 0; @@ -881,6 +905,10 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, } + if (place != NULL) { + place_destroy(place); + } + if (tree != NULL) { string_tree_destroy(tree); } From 86d5eca521a0f641e035a686f6bdf9d4bf83e755 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 02:31:25 -0500 Subject: [PATCH 82/89] [api] checking for NULL responses in the cstring_array methods before converting them to char arrays --- src/libpostal.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/libpostal.c b/src/libpostal.c index 9ac2dcab..288f42c9 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -49,11 +49,13 @@ libpostal_normalize_options_t libpostal_get_default_options(void) { char **libpostal_expand_address(char *input, libpostal_normalize_options_t options, size_t *n) { cstring_array *strings = expand_address(input, options, n); + if (strings == NULL) return NULL; return cstring_array_to_strings(strings); } char **libpostal_expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) { cstring_array *strings = expand_address_root(input, options, n); + if (strings == NULL) return NULL; return cstring_array_to_strings(strings); } @@ -85,6 +87,10 @@ libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options( char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes) { cstring_array *strings = near_dupe_hashes(num_components, labels, values, options); + if (strings == NULL) { + *num_hashes = 0; + return NULL; + } *num_hashes = cstring_array_num_strings(strings); return cstring_array_to_strings(strings); } @@ -92,6 +98,10 @@ char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **v char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t 
options, size_t num_languages, char **languages, size_t *num_hashes) { cstring_array *strings = near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages); + if (strings == NULL) { + *num_hashes = 0; + return NULL; + } *num_hashes = cstring_array_num_strings(strings); return cstring_array_to_strings(strings); } @@ -99,6 +109,10 @@ char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages) { language_classifier_response_t *lang_response = place_languages(num_components, labels, values); + if (lang_response == NULL) { + *num_languages = 0; + return NULL; + } char **languages = lang_response->languages; lang_response->languages = NULL; From 434bbd4dc28c4696d6842dfbf1fdfeb5e8dad517 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 02:31:43 -0500 Subject: [PATCH 83/89] [fix] removing unused vars --- src/soft_tfidf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c index f5f1800f..3c77be89 100644 --- a/src/soft_tfidf.c +++ b/src/soft_tfidf.c @@ -214,7 +214,6 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char for (size_t i = 0; i < len1; i++) { uint32_array *t1u = t1_tokens_unicode[i]; uint32_array *t2u; - char *t1 = tokens1[i]; double t1_score = token_scores1[i]; double max_sim = 0.0; @@ -237,7 +236,6 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char double t2_score; for (size_t j = 0; j < len2; j++) { - char *t2 = tokens2[j]; t2u = t2_tokens_unicode[j]; int64_t pm2 = phrase_memberships2 != NULL ? phrase_memberships2[j] : NULL_PHRASE_MEMBERSHIP; phrase_t p2 = pm2 >= 0 ? 
phrases2->a[pm2] : NULL_PHRASE; From 3263c84b321cccf6964a2508fc7e3148dc1db354 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 02:32:35 -0500 Subject: [PATCH 84/89] [api] using uint32_t for geohash precision option --- src/libpostal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libpostal.h b/src/libpostal.h index 76aa4ab5..1b805c2a 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -185,7 +185,7 @@ typedef struct libpostal_near_dupe_hash_options { bool with_latlon; double latitude; double longitude; - size_t geohash_precision; + uint32_t geohash_precision; bool name_and_address_keys; bool name_only_keys; bool address_only_keys; From 668e46796797290d7b94a508753d0cb8cfca6df8 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 02:33:33 -0500 Subject: [PATCH 85/89] [dedupe/test] checking for NULL in near_dupe test program --- src/near_dupe_test.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/near_dupe_test.c b/src/near_dupe_test.c index db7d67bf..a5fa5aa7 100644 --- a/src/near_dupe_test.c +++ b/src/near_dupe_test.c @@ -102,13 +102,15 @@ int main(int argc, char **argv) { size_t num_near_dupe_hashes = 0; char **near_dupe_hashes = libpostal_near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages, &num_near_dupe_hashes); + if (near_dupe_hashes != NULL) { + for (size_t i = 0; i < num_near_dupe_hashes; i++) { + char *near_dupe_hash = near_dupe_hashes[i]; + printf("%s\n", near_dupe_hash); + } - for (size_t i = 0; i < num_near_dupe_hashes; i++) { - char *near_dupe_hash = near_dupe_hashes[i]; - printf("%s\n", near_dupe_hash); + libpostal_expansion_array_destroy(near_dupe_hashes, num_near_dupe_hashes); } - libpostal_expansion_array_destroy(near_dupe_hashes, num_near_dupe_hashes); libpostal_expansion_array_destroy(labels, num_components); libpostal_expansion_array_destroy(values, num_components); From 34fe7ec305b0c33af0fd75719d40134e084dbab0 Mon Sep 17 00:00:00 2001 
From: Al Date: Sat, 30 Dec 2017 02:34:06 -0500 Subject: [PATCH 86/89] [expand] adding a few of the address phrase checks to the expand header --- src/expand.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/expand.h b/src/expand.h index 2857f402..70980daa 100644 --- a/src/expand.h +++ b/src/expand.h @@ -42,6 +42,14 @@ bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, toke void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options); + +bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components); +bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components); + + typedef enum { EXPAND_PHRASES, KEEP_PHRASES, From 34c3ee7f7a1e2d41a1282eb94f33e9ce929daaf6 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 03:24:39 -0500 Subject: [PATCH 87/89] [fix] update to struct --- src/libpostal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libpostal.h b/src/libpostal.h index 1b805c2a..5f253566 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -237,7 +237,7 @@ typedef struct libpostal_fuzzy_duplicate_options { double likely_dupe_threshold; } libpostal_fuzzy_duplicate_options_t; -typedef struct libpostal_duplicate_status_similarity { +typedef struct libpostal_fuzzy_duplicate_status { libpostal_duplicate_status_t status; double similarity; } libpostal_fuzzy_duplicate_status_t; From 4e325657469227b854c4552984ae7f999df18fa8 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 18:05:23 -0500 Subject: [PATCH 88/89] [dedupe] fixing toponym matching for city-equivalents, adding the 
LIBPOSTAL_ADDRESS_ANY component in each function call so it can be removed as needed. --- src/dedupe.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/dedupe.c b/src/dedupe.c index 94453f26..30fbe2dd 100644 --- a/src/dedupe.c +++ b/src/dedupe.c @@ -84,8 +84,6 @@ libpostal_duplicate_status_t is_duplicate(char *value1, char *value2, libpostal_ normalize_options.num_languages = options.num_languages; normalize_options.languages = options.languages; - normalize_options.address_components |= LIBPOSTAL_ADDRESS_ANY; - if (root_comparison_first) { if (address_component_equals_root(value1, value2, normalize_options)) { return root_comparison_status; @@ -104,14 +102,14 @@ libpostal_duplicate_status_t is_duplicate(char *value1, char *value2, libpostal_ libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME; + normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = false; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); } libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET; + normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = false; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, 
root_comparison_status); @@ -119,7 +117,7 @@ libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, lib libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER; + normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); @@ -127,7 +125,7 @@ libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT; + normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); @@ -135,7 +133,7 @@ libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpo libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL; + normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; return 
is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); @@ -143,7 +141,7 @@ libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libp libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX; + normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); @@ -151,7 +149,7 @@ libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, lib libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE; + normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_ANY; bool root_comparison_first = true; libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE; return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status); @@ -159,7 +157,7 @@ libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2 libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); - normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM; + normalize_options.address_components = 
LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_ANY; place_t *place1 = place_from_components(num_components1, labels1, values1); place_t *place2 = place_from_components(num_components2, labels2, values2); @@ -169,27 +167,37 @@ libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char * if (place1->city != NULL && place2->city != NULL) { city_match = address_component_equals(place1->city, place2->city, normalize_options); - dupe_status = LIBPOSTAL_EXACT_DUPLICATE; + if (city_match) { + dupe_status = LIBPOSTAL_EXACT_DUPLICATE; + } } if (!city_match && place1->city == NULL && place1->city_district != NULL && place2->city != NULL) { city_match = address_component_equals(place1->city_district, place2->city, normalize_options); - dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + if (city_match) { + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } } if (!city_match && place1->city == NULL && place1->suburb != NULL && place2->city != NULL) { city_match = address_component_equals(place1->suburb, place2->city, normalize_options); - dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + if (city_match) { + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } } if (!city_match && place2->city == NULL && place2->city_district != NULL && place1->city != NULL) { city_match = address_component_equals(place1->city, place2->city_district, normalize_options); - dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + if (city_match) { + dupe_status = LIBPOSTAL_LIKELY_DUPLICATE; + } } if (!city_match && place2->city == NULL && place2->suburb != NULL && place1->city != NULL) { city_match = address_component_equals(place1->suburb, place2->suburb, normalize_options); - dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + if (city_match) { + dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW; + } } if (!city_match) { @@ -389,3 +397,4 @@ inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_t return is_fuzzy_duplicate(num_tokens1, tokens1, 
token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms); } + From 3bdb8c86306a9b155f934b20b1168f8631f3ccba Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 31 Dec 2017 13:22:00 -0500 Subject: [PATCH 89/89] [similarity] max out the Jaro-Winkler shared prefix at 4 characters in accordance with Winkler's paper --- src/string_similarity.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index 61882325..6a16518d 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -520,6 +520,8 @@ double jaro_distance(const char *s1, const char *s2) { return jaro; } +#define MAX_JARO_WINKLER_PREFIX 4 + double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) { double jaro = jaro_distance_unicode(u1_array, u2_array); @@ -533,15 +535,20 @@ double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, ui size_t m = len1 < len2 ? len1 : len2; - size_t i = 0; - for (; i < m; i++) { + size_t shared_prefix = 0; + for (size_t i = 0; i < m; i++) { if (u1[i] != u2[i]) break; + shared_prefix++; + if (shared_prefix > MAX_JARO_WINKLER_PREFIX) { + shared_prefix = MAX_JARO_WINKLER_PREFIX; + break; + } } double jaro_winkler = jaro; if (jaro >= bonus_threshold) { - jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale; + jaro_winkler += (1.0 - jaro) * shared_prefix * prefix_scale; } return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;