#include <stdlib.h>

#include "expand.h"

#include "log/log.h"

#include "address_dictionary.h"
#include "collections.h"
#include "constants.h"
#include "language_classifier.h"
#include "numex.h"
#include "normalize.h"
#include "scanner.h"
#include "string_utils.h"
#include "token_types.h"
#include "transliterate.h"

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifndef HAVE_STRNDUP
#include "strndup.h"
#endif


#define DEFAULT_KEY_LEN 32

#define EXCESSIVE_PERMUTATIONS 100
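
/* Map the per-token flags in libpostal_normalize_options_t onto the internal
   NORMALIZE_TOKEN_* bitmask consumed by normalize_token(). */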
inline uint64_t get_normalize_token_options(libpostal_normalize_options_t options) {
    uint64_t normalize_token_options = 0;

    normalize_token_options |= options.delete_final_periods ? NORMALIZE_TOKEN_DELETE_FINAL_PERIOD : 0;
    normalize_token_options |= options.delete_acronym_periods ? NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS : 0;
    normalize_token_options |= options.drop_english_possessives ? NORMALIZE_TOKEN_DROP_ENGLISH_POSSESSIVES : 0;
    normalize_token_options |= options.delete_apostrophes ? NORMALIZE_TOKEN_DELETE_OTHER_APOSTROPHE : 0;

    return normalize_token_options;
}
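
/* Same mapping for whole-string normalization flags (NORMALIZE_STRING_*),
   used when normalizing canonical dictionary strings with
   normalize_string_latin(). */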
inline uint64_t get_normalize_string_options(libpostal_normalize_options_t options) {
    uint64_t normalize_string_options = 0;
    normalize_string_options |= options.transliterate ? NORMALIZE_STRING_TRANSLITERATE : 0;
    normalize_string_options |= options.latin_ascii ? NORMALIZE_STRING_LATIN_ASCII : 0;
    normalize_string_options |= options.decompose ? NORMALIZE_STRING_DECOMPOSE : 0;
    normalize_string_options |= options.strip_accents ? NORMALIZE_STRING_STRIP_ACCENTS : 0;
    normalize_string_options |= options.lowercase ? NORMALIZE_STRING_LOWERCASE : 0;
    normalize_string_options |= options.trim_string ? NORMALIZE_STRING_TRIM : 0;
    normalize_string_options |= options.expand_numex ? NORMALIZE_STRING_REPLACE_NUMEX : 0;

    return normalize_string_options;
}
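
/* Append every normalized form of a single token to strings: the plain
   normalization, plus hyphen-replaced/hyphen-deleted variants and an
   alpha-from-numeric split where the options call for them. Whitespace
   tokens collapse to a single space. */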
void add_normalized_strings_token(cstring_array *strings, char *str, token_t token, libpostal_normalize_options_t options) {

    uint64_t normalize_token_options = get_normalize_token_options(options);

    if (token.type != WHITESPACE) {

        bool contains_hyphen = string_contains_hyphen_len(str + token.offset, token.len);

        if (!contains_hyphen || token.type == HYPHEN) {
            log_debug("str = %s, token = {%zu, %zu, %u}\n", str, token.offset, token.len, token.type);
            normalize_token(strings, str, token, normalize_token_options);
        } else if (is_word_token(token.type)) {

            size_t prefix_hyphen_len = string_hyphen_prefix_len(str + token.offset, token.len);
            if (prefix_hyphen_len > 0) {
                token.offset += prefix_hyphen_len;
            }

            size_t suffix_hyphen_len = string_hyphen_suffix_len(str + token.offset, token.len);
            if (suffix_hyphen_len > 0) {
                token.len -= suffix_hyphen_len;
            }

            normalize_token(strings, str, token, normalize_token_options);

            if (options.replace_word_hyphens) {
                normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS;
            }

            if (options.delete_word_hyphens) {
                normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS;
            }

        } else if (is_numeric_token(token.type)) {

            normalize_token(strings, str, token, normalize_token_options);

            if (options.replace_word_hyphens || options.replace_numeric_hyphens) {
                if (options.replace_word_hyphens) {
                    normalize_token_options |= NORMALIZE_TOKEN_REPLACE_HYPHENS;
                }

                if (options.replace_numeric_hyphens) {
                    normalize_token_options |= NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS;
                }

                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_REPLACE_HYPHENS | NORMALIZE_TOKEN_REPLACE_NUMERIC_HYPHENS;
            }

            if (options.delete_numeric_hyphens) {
                normalize_token_options |= NORMALIZE_TOKEN_DELETE_HYPHENS;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_DELETE_HYPHENS;
            }
        }

        if (is_numeric_token(token.type) && options.split_alpha_from_numeric) {
            bool split_alpha_from_numeric = true;

            for (size_t i = 0; i < options.num_languages; i++) {
                char *lang = options.languages[i];
                if (valid_ordinal_suffix_len(str, token, NULL_TOKEN, lang) > 1) {
                    split_alpha_from_numeric = false;
                    break;
                }
            }

            if (split_alpha_from_numeric) {
                normalize_token_options |= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
                normalize_token(strings, str, token, normalize_token_options);
                normalize_token_options ^= NORMALIZE_TOKEN_SPLIT_ALPHA_FROM_NUMERIC;
            }
        }
    } else {
        cstring_array_add_string(strings, " ");
    }
}
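
/* Add the postprocessed string itself, plus a Roman-numeral-expanded
   variant (via the Latin numex rules) when options.roman_numerals is set. */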
void add_postprocessed_string(cstring_array *strings, char *str, libpostal_normalize_options_t options) {
    cstring_array_add_string(strings, str);

    if (options.roman_numerals) {
        char *numex_replaced = replace_numeric_expressions(str, LATIN_LANGUAGE_CODE);
        if (numex_replaced != NULL) {
            cstring_array_add_string(strings, numex_replaced);
            free(numex_replaced);
        }
    }
}
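
/* Return the expansions stored for a dictionary phrase, but only when the
   phrase is valid for at least one of the requested address components. */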
address_expansion_array *valid_affix_expansions(phrase_t phrase, libpostal_normalize_options_t options) {
    uint32_t expansion_index = phrase.data;
    address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);
    if (value != NULL && value->components & options.address_components) {
        return value->expansions;
    }

    return NULL;
}
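
/* Concatenate a single affix expansion onto key: the normalized canonical
   form when the expansion has one, otherwise the raw affix substring. */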
inline void cat_affix_expansion(char_array *key, char *str, address_expansion_t expansion, token_t token, phrase_t phrase, libpostal_normalize_options_t options) {
    if (expansion.canonical_index != NULL_CANONICAL_INDEX) {
        char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
        uint64_t normalize_string_options = get_normalize_string_options(options);
        char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options);
        canonical = canonical_normalized != NULL ? canonical_normalized : canonical;

        char_array_cat(key, canonical);
        if (canonical_normalized != NULL) {
            free(canonical_normalized);
        }
    } else {
        char_array_cat_len(key, str + token.offset + phrase.start, phrase.len);
    }
}
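
/* Expand a token that has a dictionary prefix and/or suffix attached (e.g. a
   street-type suffix glued onto the word). Builds every combination of
   prefix expansion x root normalization x suffix expansion, with and without
   separating spaces, and adds each to the tree's string set. */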
bool add_affix_expansions(string_tree_t *tree, char *str, char *lang, token_t token, phrase_t prefix, phrase_t suffix, libpostal_normalize_options_t options, bool with_period) {
    cstring_array *strings = tree->strings;

    size_t skip_period = with_period ? 1 : 0;

    bool have_suffix = suffix.len > 0 && suffix.len < token.len;
    bool have_prefix = prefix.len > 0 && prefix.len + with_period < token.len;

    if (!have_suffix && !have_prefix) {
        return false;
    }

    address_expansion_array *prefix_expansions = NULL;
    address_expansion_array *suffix_expansions = NULL;

    address_expansion_t prefix_expansion;
    address_expansion_t suffix_expansion;

    char *expansion;

    size_t num_strings = 0;
    char *root_word = NULL;
    size_t root_len;
    token_t root_token;
    cstring_array *root_strings = NULL;
    int add_space = 0;
    int spaces = 0;

    size_t prefix_start, prefix_end, root_end, suffix_start;

    if (have_prefix) {
        prefix_expansions = valid_affix_expansions(prefix, options);
        if (prefix_expansions == NULL) have_prefix = false;
    }

    if (have_suffix) {
        suffix_expansions = valid_affix_expansions(suffix, options);
        if (suffix_expansions == NULL) have_suffix = false;
    }

    if (!have_suffix && !have_prefix) {
        return false;
    }

    char_array *key = char_array_new_size(token.len);

    if (have_prefix && have_suffix) {
        for (size_t i = 0; i < prefix_expansions->n; i++) {
            prefix_expansion = prefix_expansions->a[i];
            char_array_clear(key);

            cat_affix_expansion(key, str, prefix_expansion, token, prefix, options);
            prefix_start = key->n - 1;

            add_space = (int)prefix_expansion.separable || with_period;
            if (prefix.len + skip_period + suffix.len < token.len && !prefix_expansion.separable) {
                add_space = suffix_expansion.separable || with_period;
            }

            for (spaces = skip_period; spaces <= add_space; spaces++) {
                key->n = prefix_start;
                if (spaces) {
                    char_array_cat(key, " ");
                }

                prefix_end = key->n;

                if (prefix.len + skip_period + suffix.len < token.len) {
                    root_len = token.len - suffix.len - prefix.len - skip_period;
                    size_t root_start = token.offset + prefix.len + skip_period;
                    size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len);
                    root_start += prefix_hyphen_len;
                    root_len -= prefix_hyphen_len;
                    size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len);
                    root_len -= suffix_hyphen_len;
                    root_token = (token_t){root_start, root_len, token.type};
                    root_strings = cstring_array_new_size(root_len);
                    add_normalized_strings_token(root_strings, str, root_token, options);
                    num_strings = cstring_array_num_strings(root_strings);

                    for (size_t j = 0; j < num_strings; j++) {
                        key->n = prefix_end;
                        root_word = cstring_array_get_string(root_strings, j);
                        char_array_cat(key, root_word);
                        root_end = key->n - 1;

                        for (size_t k = 0; k < suffix_expansions->n; k++) {
                            key->n = root_end;
                            suffix_expansion = suffix_expansions->a[k];

                            int add_suffix_space = suffix_expansion.separable;

                            suffix_start = key->n;
                            for (int suffix_spaces = skip_period; suffix_spaces <= add_suffix_space; suffix_spaces++) {
                                key->n = suffix_start;
                                if (suffix_spaces) {
                                    char_array_cat(key, " ");
                                }

                                cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                                expansion = char_array_get_string(key);
                                cstring_array_add_string(strings, expansion);
                            }
                        }
                    }

                    cstring_array_destroy(root_strings);
                    root_strings = NULL;

                } else {
                    for (size_t j = 0; j < suffix_expansions->n; j++) {
                        key->n = prefix_end - skip_period;
                        suffix_expansion = suffix_expansions->a[j];

                        cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                        expansion = char_array_get_string(key);
                        cstring_array_add_string(tree->strings, expansion);
                    }
                }
            }
        }
    } else if (have_suffix) {
        log_debug("suffix.start=%" PRId32 "\n", suffix.start);
        root_len = suffix.start;
        root_token = (token_t){token.offset, root_len, token.type};
        log_debug("root_len=%zu\n", root_len);
        log_debug("root_token = {%zu, %zu, %u}\n", root_token.offset, root_token.len, root_token.type);

        root_strings = cstring_array_new_size(root_len + 1);
        add_normalized_strings_token(root_strings, str, root_token, options);
        num_strings = cstring_array_num_strings(root_strings);

        log_debug("num_strings = %zu\n", num_strings);

        for (size_t j = 0; j < num_strings; j++) {
            char_array_clear(key);
            root_word = cstring_array_get_string(root_strings, j);
            log_debug("root_word=%s\n", root_word);
            char_array_cat(key, root_word);
            root_end = key->n - 1;

            for (size_t k = 0; k < suffix_expansions->n; k++) {
                key->n = root_end;
                suffix_expansion = suffix_expansions->a[k];

                add_space = (suffix_expansion.separable || with_period) && suffix.len < token.len;
                suffix_start = key->n;

                for (int spaces = skip_period; spaces <= add_space; spaces++) {
                    key->n = suffix_start;
                    if (spaces) {
                        char_array_cat(key, " ");
                    }

                    cat_affix_expansion(key, str, suffix_expansion, token, suffix, options);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }
            }
        }
    } else if (have_prefix) {
        if (prefix.len + skip_period <= token.len) {
            root_len = token.len - prefix.len - skip_period;
            size_t root_start = token.offset + prefix.len + skip_period;
            size_t prefix_hyphen_len = string_hyphen_prefix_len(str + root_start, root_len);
            root_start += prefix_hyphen_len;
            root_len -= prefix_hyphen_len;
            size_t suffix_hyphen_len = string_hyphen_suffix_len(str + root_start, root_len);
            root_len -= suffix_hyphen_len;
            root_token = (token_t){root_start, root_len, token.type};
            root_strings = cstring_array_new_size(root_len);
            add_normalized_strings_token(root_strings, str, root_token, options);
            num_strings = cstring_array_num_strings(root_strings);

        } else {
            root_strings = cstring_array_new_size(token.len);
            add_normalized_strings_token(root_strings, str, token, options);
            num_strings = cstring_array_num_strings(root_strings);

            for (size_t k = 0; k < num_strings; k++) {
                root_word = cstring_array_get_string(root_strings, k);
                cstring_array_add_string(tree->strings, root_word);
            }

            char_array_destroy(key);
            cstring_array_destroy(root_strings);
            return false;
        }

        for (size_t j = 0; j < prefix_expansions->n; j++) {
            char_array_clear(key);
            prefix_expansion = prefix_expansions->a[j];

            cat_affix_expansion(key, str, prefix_expansion, token, prefix, options);
            prefix_end = key->n - 1;

            add_space = (prefix_expansion.separable || with_period) && prefix.len + skip_period < token.len;
            for (int spaces = skip_period; spaces <= add_space; spaces++) {
                key->n = prefix_end;
                if (spaces) {
                    char_array_cat(key, " ");
                }
                size_t prefix_space_len = key->n - spaces;
                for (size_t k = 0; k < num_strings; k++) {
                    key->n = prefix_space_len;
                    root_word = cstring_array_get_string(root_strings, k);
                    char_array_cat(key, root_word);

                    expansion = char_array_get_string(key);
                    cstring_array_add_string(tree->strings, expansion);
                }
            }
        }
    }

    char_array_destroy(key);

    if (root_strings != NULL) {
        cstring_array_destroy(root_strings);
    }

    return true;
}
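
/* Check one token for dictionary prefixes/suffixes in a single language and
   expand them if found. */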
inline bool expand_affixes(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
    phrase_t suffix = search_address_dictionaries_suffix(str + token.offset, token.len, lang);

    phrase_t prefix = search_address_dictionaries_prefix(str + token.offset, token.len, lang);

    if (suffix.len == 0 && prefix.len == 0) return false;

    bool with_period = false;

    return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period);
}
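
/* Handle tokens with an internal period: treat the text before the first
   period as a potential prefix phrase and the text after it as a potential
   suffix phrase, then expand both around the period. */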
inline bool expand_affixes_period(string_tree_t *tree, char *str, char *lang, token_t token, libpostal_normalize_options_t options) {
    ssize_t first_period_index = string_next_period_len(str + token.offset, token.len);
    if (first_period_index > 0) {
        ssize_t next_period_index = string_next_period_len(str + token.offset + first_period_index + 1, token.len - first_period_index - 1);
        // Token contains only one period, or one plus a final period
        if (next_period_index < 0 || next_period_index == token.len - 1) {
            phrase_t prefix = search_address_dictionaries_substring(str + token.offset, first_period_index, lang);

            phrase_t suffix = search_address_dictionaries_substring(str + token.offset + first_period_index + 1, token.len - first_period_index - 1, lang);
            if (suffix.len > 0) {
                suffix.start = first_period_index + 1;
            }

            if (suffix.len == 0 && prefix.len == 0) return false;

            bool with_period = true;

            return add_affix_expansions(tree, str, lang, token, prefix, suffix, options, with_period);
        } else {
            return false;
        }
    } else {
        return false;
    }
}
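
/* For tokens containing a period, try period-affix expansion in each
   requested language; fall back to adding the raw token when no language
   produces a match. */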
bool add_period_affixes_or_token(string_tree_t *tree, char *str, token_t token, libpostal_normalize_options_t options) {
    bool have_period_affixes = false;
    if (string_contains_period_len(str + token.offset, token.len)) {
        for (size_t l = 0; l < options.num_languages; l++) {
            char *lang = options.languages[l];
            if (expand_affixes_period(tree, str, lang, token, options)) {
                have_period_affixes = true;
                break;
            }
        }
    }

    if (!have_period_affixes) {
        string_tree_add_string_len(tree, str + token.offset, token.len);
    }

    return have_period_affixes;
}
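
/* For each gazetteer dictionary, the address components in which a phrase
   from that dictionary may be deleted outright (e.g. a street type like
   "Ave" is ignorable within a street name). */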
static inline uint32_t gazetteer_ignorable_components(uint16_t dictionary_id) {
    switch (dictionary_id) {
        case DICTIONARY_ACADEMIC_DEGREE:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_BUILDING_TYPE:
            return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_UNIT;
        case DICTIONARY_COMPANY_TYPE:
            return LIBPOSTAL_ADDRESS_NAME;
        case DICTIONARY_DIRECTIONAL:
            return LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_ELISION:
            return LIBPOSTAL_ADDRESS_ANY;
        case DICTIONARY_ENTRANCE:
            return LIBPOSTAL_ADDRESS_ENTRANCE;
        case DICTIONARY_HOUSE_NUMBER:
            return LIBPOSTAL_ADDRESS_HOUSE_NUMBER;
        case DICTIONARY_LEVEL_NUMBERED:
            return LIBPOSTAL_ADDRESS_LEVEL;
        case DICTIONARY_LEVEL_STANDALONE:
            return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
        case DICTIONARY_LEVEL_MEZZANINE:
            return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
        case DICTIONARY_LEVEL_BASEMENT:
            return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
        case DICTIONARY_LEVEL_SUB_BASEMENT:
            return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY);
        case DICTIONARY_NUMBER:
            return LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_NO_NUMBER:
            return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY);
        case DICTIONARY_PERSONAL_TITLE:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_PLACE_NAME:
            return LIBPOSTAL_ADDRESS_NAME;
        case DICTIONARY_POST_OFFICE:
            return LIBPOSTAL_ADDRESS_PO_BOX;
        case DICTIONARY_POSTAL_CODE:
            return LIBPOSTAL_ADDRESS_POSTAL_CODE;
        case DICTIONARY_QUALIFIER:
            return LIBPOSTAL_ADDRESS_TOPONYM;
        case DICTIONARY_STAIRCASE:
            return LIBPOSTAL_ADDRESS_STAIRCASE;
        case DICTIONARY_STOPWORD:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
        case DICTIONARY_STREET_TYPE:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_UNIT_NUMBERED:
            return LIBPOSTAL_ADDRESS_UNIT;
        case DICTIONARY_UNIT_STANDALONE:
            return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
        case DICTIONARY_UNIT_DIRECTION:
            return LIBPOSTAL_ADDRESS_ALL ^ (LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY);
        default:
            return LIBPOSTAL_ADDRESS_NONE;
    }
}
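
/* Components in which a phrase from a given dictionary counts as a valid
   match (as opposed to merely ignorable). */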
static inline uint32_t gazetteer_valid_components(uint16_t dictionary_id) {
    switch (dictionary_id) {
        case DICTIONARY_DIRECTIONAL:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE;
        case DICTIONARY_STOPWORD:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
        case DICTIONARY_STREET_NAME:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_STREET_TYPE:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_SYNONYM:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_CATEGORY | LIBPOSTAL_ADDRESS_NEAR | LIBPOSTAL_ADDRESS_TOPONYM;
        default:
            return LIBPOSTAL_ADDRESS_NONE;
    }
}
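
/* Dictionaries whose phrases may be dropped only at the edges of a string,
   e.g. pre-/post-directionals like the "E" in "E 125th St". */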
static inline uint32_t gazetteer_edge_ignorable_components(uint16_t dictionary_id) {
    switch (dictionary_id) {
        // Pre/post directionals can be removed if there are non-phrase tokens
        case DICTIONARY_DIRECTIONAL:
            return LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_COMPANY_TYPE:
            return LIBPOSTAL_ADDRESS_NAME;
        case DICTIONARY_PLACE_NAME:
            return LIBPOSTAL_ADDRESS_NAME;
        default:
            return LIBPOSTAL_ADDRESS_NONE;
    }
}
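
/* Dictionaries that act as specifiers: standalone unit/level words (like the
   "PH" in "PH 1") that qualify an accompanying number and so should not be
   deleted. */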
static inline uint32_t gazetteer_specifier_components(uint16_t dictionary_id) {
    switch (dictionary_id) {
        case DICTIONARY_LEVEL_STANDALONE:
            return LIBPOSTAL_ADDRESS_LEVEL;
        case DICTIONARY_LEVEL_MEZZANINE:
            return LIBPOSTAL_ADDRESS_LEVEL;
        case DICTIONARY_LEVEL_BASEMENT:
            return LIBPOSTAL_ADDRESS_LEVEL;
        case DICTIONARY_LEVEL_SUB_BASEMENT:
            return LIBPOSTAL_ADDRESS_LEVEL;
        case DICTIONARY_UNIT_STANDALONE:
            return LIBPOSTAL_ADDRESS_UNIT;
        default:
            return LIBPOSTAL_ADDRESS_NONE;
    }
}
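
/* Dictionaries whose phrases can plausibly be the root (head word) of a name
   or street, and thus should not be deleted when nothing else would remain. */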
static inline uint32_t gazetteer_possible_root_components(uint16_t dictionary_id) {
    switch (dictionary_id) {
        case DICTIONARY_ACADEMIC_DEGREE:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_DIRECTIONAL:
            return LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_PERSONAL_TITLE:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_NUMBER:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_PLACE_NAME:
            return LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_QUALIFIER:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_STREET_NAME:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_SYNONYM:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        case DICTIONARY_TOPONYM:
            return LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_STREET;
        default:
            return LIBPOSTAL_ADDRESS_NONE;
    }
}

static const uint16_t NUMERIC_ADDRESS_COMPONENTS = (LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_STAIRCASE | LIBPOSTAL_ADDRESS_ENTRANCE | LIBPOSTAL_ADDRESS_STREET);
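
/* The different relationships a gazetteer phrase can have to the requested
   address components; used to dispatch the checks above. */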
typedef enum {
    GAZETTEER_MATCH_IGNORABLE,
    GAZETTEER_MATCH_EDGE_IGNORABLE,
    GAZETTEER_MATCH_POSSIBLE_ROOT,
    GAZETTEER_MATCH_SPECIFIER,
    GAZETTEER_MATCH_VALID_COMPONENTS
} gazetteer_match_type_t;
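
/* True if any dictionary this expansion belongs to matches address_components
   under the given match type. The address_expansion_is_* wrappers below fix
   the match type. */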
static inline bool address_expansion_matches_type_for_components(address_expansion_t expansion, uint32_t address_components, gazetteer_match_type_t match_type) {
    for (uint32_t j = 0; j < expansion.num_dictionaries; j++) {
        uint16_t dictionary_id = expansion.dictionary_ids[j];
        uint32_t components = 0;
        switch (match_type) {
            case GAZETTEER_MATCH_IGNORABLE:
                components = gazetteer_ignorable_components(dictionary_id);
                break;
            case GAZETTEER_MATCH_EDGE_IGNORABLE:
                components = gazetteer_edge_ignorable_components(dictionary_id);
                break;
            case GAZETTEER_MATCH_POSSIBLE_ROOT:
                components = gazetteer_possible_root_components(dictionary_id);
                break;
            case GAZETTEER_MATCH_SPECIFIER:
                components = gazetteer_specifier_components(dictionary_id);
                break;
            case GAZETTEER_MATCH_VALID_COMPONENTS:
                components = gazetteer_valid_components(dictionary_id);
                break;
            default:
                break;
        }
        if (components & address_components) {
            return true;
        }
    }
    return false;
}

bool address_expansion_is_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_IGNORABLE);
}

bool address_expansion_is_edge_ignorable_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}

bool address_expansion_is_possible_root_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}

bool address_expansion_is_specifier_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_SPECIFIER);
}

bool address_expansion_is_valid_for_components(address_expansion_t expansion, uint32_t address_components) {
    return address_expansion_matches_type_for_components(expansion, address_components, GAZETTEER_MATCH_VALID_COMPONENTS);
}
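
/* Phrase-level version: true if any expansion stored for the phrase matches.
   The address_phrase_is_* wrappers below fix the match type. */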
bool address_phrase_matches_type_for_components(phrase_t phrase, uint32_t address_components, gazetteer_match_type_t match_type) {
    uint32_t expansion_index = phrase.data;
    address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);

    if (value == NULL) return false;

    address_expansion_array *expansions = value->expansions;
    if (expansions == NULL) return false;

    for (size_t i = 0; i < expansions->n; i++) {
        address_expansion_t expansion = expansions->a[i];

        if (address_expansion_matches_type_for_components(expansion, address_components, match_type)) {
            return true;
        }
    }
    return false;
}

inline bool address_phrase_is_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_IGNORABLE);
}

inline bool address_phrase_is_edge_ignorable_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_EDGE_IGNORABLE);
}

inline bool address_phrase_is_possible_root_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_POSSIBLE_ROOT);
}

inline bool address_phrase_is_specifier_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_SPECIFIER);
}

inline bool address_phrase_is_valid_for_components(phrase_t phrase, uint32_t address_components) {
    return address_phrase_matches_type_for_components(phrase, address_components, GAZETTEER_MATCH_VALID_COMPONENTS);
}
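
/* True if at least one expansion of the phrase is not flagged as an
   ambiguous expansion. */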
bool address_phrase_contains_unambiguous_expansion(phrase_t phrase) {
    address_expansion_value_t *value = address_dictionary_get_expansions(phrase.data);
    if (value == NULL) return false;

    address_expansion_array *expansions = value->expansions;
    if (expansions == NULL) return false;

    address_expansion_t *expansions_array = expansions->a;

    for (size_t i = 0; i < expansions->n; i++) {
        address_expansion_t expansion = expansions_array[i];
        if (!address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION)) {
            return true;
        }
    }
    return false;
}
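
/* The core expansion routine: tokenize str, find every dictionary phrase in
   each requested language (plus the language-independent dictionary), and
   build a string_tree_t in which each token position holds its alternative
   surface forms. phrase_option controls whether phrases are expanded to
   their canonical forms or deleted. */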
string_tree_t *add_string_alternatives_phrase_option(char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
    char_array *key = NULL;

    log_debug("input=%s\n", str);
    token_array *token_array = tokenize_keep_whitespace(str);

    if (token_array == NULL) {
        return NULL;
    }

    size_t len = strlen(str);

    token_t *tokens = token_array->a;
    size_t num_tokens = token_array->n;

    log_debug("tokenized, num tokens=%zu\n", num_tokens);

    bool last_was_punctuation = false;

    phrase_language_array *phrases = NULL;
    phrase_array *lang_phrases = NULL;

    for (size_t i = 0; i < options.num_languages; i++) {
        char *lang = options.languages[i];
        log_debug("lang=%s\n", lang);

        lang_phrases = search_address_dictionaries_tokens(str, token_array, lang);

        if (lang_phrases == NULL) {
            log_debug("lang_phrases NULL\n");
            continue;
        }

        log_debug("lang_phrases->n = %zu\n", lang_phrases->n);

        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (size_t j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            log_debug("lang=%s, (%d, %d)\n", lang, p.start, p.len);
            phrase_language_array_push(phrases, (phrase_language_t){lang, p});
        }

        phrase_array_destroy(lang_phrases);
    }


    lang_phrases = search_address_dictionaries_tokens(str, token_array, ALL_LANGUAGES);
    if (lang_phrases != NULL) {
        phrases = phrases != NULL ? phrases : phrase_language_array_new_size(lang_phrases->n);

        for (size_t j = 0; j < lang_phrases->n; j++) {
            phrase_t p = lang_phrases->a[j];
            phrase_language_array_push(phrases, (phrase_language_t){ALL_LANGUAGES, p});
        }
        phrase_array_destroy(lang_phrases);
    }

    string_tree_t *tree = string_tree_new_size(len);

    bool last_added_was_whitespace = false;

    uint64_t normalize_string_options = get_normalize_string_options(options);
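
    /* At least one gazetteer phrase matched: walk the phrases in sorted
       order, copying the tokens between phrases through as-is and emitting
       each phrase as a set of alternatives (canonical expansions, deletions,
       or the original surface form) according to phrase_option. */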
    if (phrases != NULL) {
        log_debug("phrases not NULL, n=%zu\n", phrases->n);
        ks_introsort(phrase_language_array, phrases->n, phrases->a);

        phrase_language_t phrase_lang;

        size_t start = 0;
        size_t end = 0;

        phrase_t phrase = NULL_PHRASE;
        phrase_t prev_phrase = NULL_PHRASE;

        key = key != NULL ? key : char_array_new_size(DEFAULT_KEY_LEN);

        log_debug("phrase_option = %d\n", phrase_option);

        bool delete_phrases = phrase_option == DELETE_PHRASES;
        bool expand_phrases = phrase_option == EXPAND_PHRASES;

        size_t num_phrases = phrases->n;

        bool have_non_phrase_tokens = false;
        bool have_non_phrase_word_tokens = false;
        bool have_canonical_phrases = false;
        bool have_ambiguous = false;
        bool have_possible_root = false;
        bool have_strictly_ignorable = false;
        bool have_strictly_ignorable_abbreviation = false;

        size_t prev_phrase_end = 0;
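
        /* Pre-scan for the delete-phrases pass: record whether the string
           has tokens outside any phrase, canonical or ambiguous phrases, and
           possible roots. These flags gate what can safely be deleted in the
           main loop below. */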
        if (delete_phrases) {
            for (size_t i = 0; i < num_phrases; i++) {
                phrase_lang = phrases->a[i];
                phrase = phrase_lang.phrase;

                log_debug("phrase.start = %u, prev_phrase_end = %zu\n", phrase.start, prev_phrase_end);

                token_t inter_token;
                if (phrase.start > prev_phrase_end) {
                    for (size_t j = prev_phrase_end; j < phrase.start; j++) {
                        inter_token = tokens[j];
                        if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) {
                            log_debug("have_non_phrase_tokens\n");
                            have_non_phrase_tokens = true;
                            have_non_phrase_word_tokens = have_non_phrase_word_tokens || is_word_token(inter_token.type);
                            break;
                        }
                    }
                }

                if (i == num_phrases - 1 && phrase.start + phrase.len < num_tokens) {
                    for (size_t j = phrase.start + phrase.len; j < num_tokens; j++) {
                        inter_token = tokens[j];
                        if (!is_punctuation(inter_token.type) && !is_whitespace(inter_token.type)) {
                            have_non_phrase_tokens = true;
                            have_non_phrase_word_tokens = have_non_phrase_word_tokens || is_word_token(inter_token.type);
                            break;
                        }
                    }
                }

                bool phrase_is_ambiguous = address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
                bool phrase_is_strictly_ignorable = address_phrase_is_ignorable_for_components(phrase, options.address_components) && !phrase_is_ambiguous;
                bool phrase_is_canonical = address_phrase_has_canonical_interpretation(phrase);

                have_non_phrase_tokens = have_non_phrase_tokens || (!phrase_is_strictly_ignorable && !phrase_is_ambiguous);
                log_debug("have_non_phrase_word_tokens = %d, phrase_is_strictly_ignorable = %d, phrase_is_ambiguous = %d\n", have_non_phrase_word_tokens, phrase_is_strictly_ignorable, phrase_is_ambiguous);
                if (!have_non_phrase_word_tokens && !phrase_is_strictly_ignorable && !phrase_is_ambiguous) {
                    for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
                        token_t pt = tokens[j];
                        if (is_word_token(pt.type)) {
                            log_debug("have_non_phrase_word_tokens\n");
                            have_non_phrase_word_tokens = true;
                            break;
                        }
                    }
                }


                have_strictly_ignorable = have_strictly_ignorable || phrase_is_strictly_ignorable;
                have_strictly_ignorable_abbreviation = have_strictly_ignorable_abbreviation || (phrase_is_strictly_ignorable && !phrase_is_canonical);
                if (have_strictly_ignorable_abbreviation) {
                    log_debug("have_strictly_ignorable=%d, phrase_is_canonical=%d\n", have_strictly_ignorable, phrase_is_canonical);
                }

                have_possible_root = have_possible_root || address_phrase_is_possible_root_for_components(phrase, options.address_components);

                have_canonical_phrases = have_canonical_phrases || (phrase_is_canonical && !phrase_is_ambiguous);
                have_ambiguous = have_ambiguous || phrase_is_ambiguous;

                prev_phrase_end = phrase.start + phrase.len;
            }


            log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens);
            log_debug("have_canonical_phrases = %d\n", have_canonical_phrases);
            log_debug("have_ambiguous = %d\n", have_ambiguous);
            log_debug("have_strictly_ignorable = %d\n", have_strictly_ignorable);
            log_debug("have_strictly_ignorable_abbreviation = %d\n", have_strictly_ignorable_abbreviation);

        }

        bool skipped_last_edge_phrase = false;
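
        /* Main loop over matched phrases in sorted order. Tokens before each
           phrase are added verbatim; overlapping or shorter duplicate
           phrases are skipped. */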
        for (size_t i = 0; i < phrases->n; i++) {
            phrase_lang = phrases->a[i];

            phrase = phrase_lang.phrase;

            log_debug("phrase.start=%d, phrase.len=%d, lang=%s, prev_phrase.start=%d, prev_phrase.len=%d\n", phrase.start, phrase.len, phrase_lang.language, prev_phrase.start, prev_phrase.len);

            if ((phrase.start > prev_phrase.start && phrase.start < prev_phrase.start + prev_phrase.len) || (phrase.start == prev_phrase.start && i > 0 && phrase.len < prev_phrase.len)) {
                log_debug("continuing\n");
                continue;
            }

            char_array_clear(key);

            char_array_cat(key, phrase_lang.language);
            char_array_cat(key, NAMESPACE_SEPARATOR_CHAR);

            size_t namespace_len = key->n;

            end = phrase.start;

            log_debug("start=%zu, end=%zu\n", start, end);
            for (size_t j = start; j < end; j++) {
                log_debug("Adding token %zu\n", j);
                token_t token = tokens[j];
                if (is_punctuation(token.type)) {
                    last_was_punctuation = true;
                    continue;
                }

                if (token.type != WHITESPACE) {
                    if ((phrase.start > 0 && last_was_punctuation) || (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) || (prev_phrase.start == phrase.start && prev_phrase.len == phrase.len)) {
                        log_debug("Adding space\n");
                        string_tree_add_string(tree, " ");
                        string_tree_finalize_token(tree);
                    }
                    log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                    bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
                    string_tree_finalize_token(tree);
                    last_added_was_whitespace = false;
                } else if (!delete_phrases && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
                    log_debug("Adding pre-phrase whitespace\n");
                    last_added_was_whitespace = true;
                    string_tree_add_string(tree, " ");
                    string_tree_finalize_token(tree);
                } else {
                    continue;
                }

                last_was_punctuation = false;
            }

            size_t added_expansions = 0;
            token_t token;

            uint32_t expansion_index = phrase.data;
            address_expansion_value_t *value = address_dictionary_get_expansions(expansion_index);

            bool expansion_valid_components = (value->components & options.address_components) || address_phrase_is_valid_for_components(phrase, options.address_components);

            bool is_numeric_component = (value->components & options.address_components & NUMERIC_ADDRESS_COMPONENTS);

            if (expansion_valid_components) {
                key->n = namespace_len;
                for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens[j];
                    if (token.type != WHITESPACE) {
                        char_array_cat_len(key, str + token.offset, token.len);
                        last_added_was_whitespace = false;
                    } else if (!last_added_was_whitespace) {
                        char_array_cat(key, " ");
                        last_added_was_whitespace = true;
                    }
                }

                char *key_str = char_array_get_string(key);
                log_debug("key_str=%s\n", key_str);
                address_expansion_array *expansions = value->expansions;

                if (expansions != NULL) {
                    bool current_phrase_have_ambiguous = delete_phrases && address_phrase_in_dictionary(phrase, DICTIONARY_AMBIGUOUS_EXPANSION);
                    bool added_pre_phrase_space = false;
                    bool current_phrase_have_ignorable = delete_phrases && address_phrase_is_ignorable_for_components(phrase, options.address_components);
                    bool current_phrase_have_edge_ignorable = false;

                    bool current_phrase_have_specifier = delete_phrases && address_phrase_is_specifier_for_components(phrase, options.address_components);
                    bool current_phrase_have_canonical = delete_phrases && address_phrase_has_canonical_interpretation(phrase);
                    bool current_phrase_have_possible_root = delete_phrases && address_phrase_is_possible_root_for_components(phrase, options.address_components);

                    bool current_phrase_have_valid = address_phrase_is_valid_for_components(phrase, options.address_components);

                    log_debug("current_phrase_have_specifier = %d\n", current_phrase_have_specifier);

                    bool current_phrase_have_unambiguous = delete_phrases && address_phrase_contains_unambiguous_expansion(phrase);

                    /*
                    Edge phrase handling. This is primarily for handling pre-directionals/post-directionals
                    in English and other languages.
                    */
                    bool skip_edge_phrase = false;
                    bool other_phrase_is_ignorable = false;

                    if (delete_phrases) {
                        phrase_language_t other_phrase_lang;
                        phrase_t other_phrase;

                        log_debug("i = %zu, phrase.start = %u\n", i, phrase.start);
                        if (i == 0 && phrase.start == 0 && phrase.start + phrase.len < num_tokens) {
                            current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components);
                            // Delete "E" in "E 125th St"
                            if (current_phrase_have_edge_ignorable) {
                                log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len);
                                skip_edge_phrase = true;
                            }

                            if (!skip_edge_phrase || !have_non_phrase_tokens) {
                                for (size_t other_i = i + 1; other_i < phrases->n; other_i++) {
                                    other_phrase_lang = phrases->a[other_i];
                                    other_phrase = other_phrase_lang.phrase;
                                    log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len);
                                    log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language);
                                    if (other_phrase.start >= phrase.start + phrase.len && string_equals(other_phrase_lang.language, phrase_lang.language)) {
                                        if (other_phrase.start + other_phrase.len == num_tokens) {
                                            skip_edge_phrase = false;
                                            if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
                                                // don't delete the "E" in "E St"
                                                log_debug("initial phrase is edge ignorable out of two phrases. Checking next phrase is ignorable.\n");

                                                skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
                                                log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
                                            } else {
                                                log_debug("initial phrase is not edge-ignorable out of two phrases. Checking next phrase is edge ignorable.\n");
                                                // delete "Avenue" in "Avenue E"
                                                other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
                                                skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));

                                            }
                                        } else {
                                            // If we encounter an ignorable phrase like St and we're _not_ at the end of the string e.g. "E St SE", the first token is probably a legit token instead of a pre-directional
                                            skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !((address_phrase_has_canonical_interpretation(other_phrase) || address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components)) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components)));
                                            log_debug("phrase is possible root. skip_edge_phrase = %d\n", skip_edge_phrase);
                                        }
                                        break;
                                    }
                                }
                            }
                        } else if (phrases->n > 1 && i == phrases->n - 1 && phrase.start + phrase.len == num_tokens && phrase.start > 0) {
                            current_phrase_have_edge_ignorable = address_phrase_is_edge_ignorable_for_components(phrase, options.address_components);
                            if (current_phrase_have_edge_ignorable) {
                                log_debug("edge-ignorable phrase [%u, %u]\n", phrase.start, phrase.start + phrase.len);
                                skip_edge_phrase = true;
                            }

                            log_debug("have_non_phrase_tokens = %d\n", have_non_phrase_tokens);
                            if (!skip_edge_phrase || !have_non_phrase_tokens) {
                                for (ssize_t other_j = i - 1; other_j >= 0; other_j--) {
                                    other_phrase_lang = phrases->a[other_j];
                                    other_phrase = other_phrase_lang.phrase;
                                    log_debug("phrase.start + phrase.len = %u\n", phrase.start + phrase.len);
                                    log_debug("other_phrase.start = %u, other_phrase.len = %u, lang=%s\n", other_phrase.start, other_phrase.len, other_phrase_lang.language);
                                    if (other_phrase.start + other_phrase.len <= phrase.start && string_equals(other_phrase_lang.language, phrase_lang.language)) {
                                        if (other_phrase.start == 0) {
                                            //other_phrase_invalid = address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !address_phrase_has_canonical_interpretation(other_phrase) && !address_phrase_is_possible_root_for_components(other_phrase, options.address_components);
                                            skip_edge_phrase = false;
                                            if (current_phrase_have_edge_ignorable || (current_phrase_have_ambiguous && current_phrase_have_canonical)) {
                                                // don't delete the "E" in "Avenue E"
                                                log_debug("final phrase is edge ignorable out of two phrases. Checking previous phrase is ignorable.\n");

                                                skip_edge_phrase = !(address_phrase_is_ignorable_for_components(other_phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(other_phrase) && address_phrase_is_possible_root_for_components(other_phrase, options.address_components))) && string_tree_num_tokens(tree) > 0;
                                            } else {
                                                log_debug("final phrase is not edge-ignorable out of two phrases. Checking previous phrase is edge ignorable.\n");
                                                // delete "St" in "E St"
                                                other_phrase_is_ignorable = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components) || (address_phrase_in_dictionary(other_phrase, DICTIONARY_AMBIGUOUS_EXPANSION) && address_phrase_has_canonical_interpretation(other_phrase));
                                                skip_edge_phrase = other_phrase_is_ignorable && address_phrase_is_ignorable_for_components(phrase, options.address_components) && !(address_phrase_has_canonical_interpretation(phrase) && address_phrase_is_possible_root_for_components(phrase, options.address_components));

                                                //skip_edge_phrase = address_phrase_is_edge_ignorable_for_components(other_phrase, options.address_components);
                                            }
                                        }
                                        break;
                                    }
                                }
                            }
                        }
                    }

                    if (phrase.start == prev_phrase.start && phrase.len == prev_phrase.len && skipped_last_edge_phrase) {
                        skip_edge_phrase = true;
                    }
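
                    /* For each expansion of this phrase, decide whether to
                       delete it, replace it with its canonical form, or copy
                       the original surface form through. */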
                    for (size_t j = 0; j < expansions->n; j++) {
                        if (skip_edge_phrase) {
                            skipped_last_edge_phrase = true;
                            log_debug("skip edge phrase\n");
                            continue;
                        } else {
                            skipped_last_edge_phrase = false;
                        }

                        address_expansion_t expansion = expansions->a[j];

                        bool current_phrase_ignorable = false;
                        bool current_phrase_expandable = expand_phrases && expansion.canonical_index != NULL_CANONICAL_INDEX;

                        bool is_ambiguous = address_expansion_in_dictionary(expansion, DICTIONARY_AMBIGUOUS_EXPANSION);
                        bool is_valid_for_components = address_expansion_is_valid_for_components(expansion, options.address_components);

                        if (delete_phrases) {
                            bool is_ignorable = address_expansion_is_ignorable_for_components(expansion, options.address_components);
                            bool is_canonical = expansion.canonical_index == NULL_CANONICAL_INDEX;

                            log_debug("is_ignorable = %d, is_canonical = %d, is_ambiguous = %d, current_phrase_have_ambiguous = %d, current_phrase_have_unambiguous = %d, have_strictly_ignorable = %d, current_phrase_have_ignorable=%d, current_phrase_have_possible_root=%d\n", is_ignorable, is_canonical, is_ambiguous, current_phrase_have_ambiguous, current_phrase_have_unambiguous, have_strictly_ignorable, current_phrase_have_ignorable, current_phrase_have_possible_root);

                            current_phrase_expandable = current_phrase_expandable || current_phrase_have_ambiguous;

                            if (!is_canonical) {
                                char *canon = address_dictionary_get_canonical(expansion.canonical_index);
                                log_debug("canonical = %s\n", canon);
                            }

                            // Edge phrase calculations from above
                            if (current_phrase_have_edge_ignorable || other_phrase_is_ignorable) {
                                log_debug("current_phrase_have_edge_ignorable\n");
                                log_debug("skip_edge_phrase = %d\n", skip_edge_phrase);
                                current_phrase_ignorable = skip_edge_phrase;
                            // Don't delete "PH" in "PH 1" for unit expansions
                            } else if (is_ignorable && current_phrase_have_specifier) {
                                log_debug("current_phrase_have_specifier\n");
                                current_phrase_ignorable = false;
                            // Delete "Avenue" in "5th Avenue"
                            } else if (is_ignorable && is_canonical && !current_phrase_have_ambiguous) {
                                log_debug("is_ignorable && is_canonical && !current_phrase_have_ambiguous\n");
                                current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0;
                                log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
                            // Delete "Ave" in "5th Ave" or "Pl" in "Park Pl S"
                            } else if (is_ignorable && !is_canonical && !is_ambiguous && !current_phrase_have_ambiguous) {
                                log_debug("is_ignorable && !is_canonical && !current_phrase_have_ambiguous\n");
                                current_phrase_ignorable = have_non_phrase_tokens || (have_possible_root && !current_phrase_have_possible_root) || string_tree_num_tokens(tree) > 0;
                                log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
                            } else if (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || is_numeric_component || have_canonical_phrases || have_possible_root)) {
                                log_debug("current_phrase_have_ambiguous && have_non_phrase_tokens = %d, have_canonical_phrases = %d, have_possible_root = %d, have_non_phrase_word_tokens = %d, is_numeric_component = %d, have_non_phrase_tokens = %d\n", have_non_phrase_tokens, have_canonical_phrases, have_possible_root, have_non_phrase_word_tokens, is_numeric_component, have_non_phrase_tokens);
                                current_phrase_ignorable = (is_ignorable && !(have_possible_root && !current_phrase_have_possible_root)) || (current_phrase_have_ambiguous && (have_non_phrase_word_tokens || (is_numeric_component && have_non_phrase_tokens)) && current_phrase_have_ignorable && current_phrase_have_unambiguous);
                                log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
                            } else if (!is_valid_for_components && !is_ambiguous) {
                                log_debug("!is_valid_for_components\n");
                                current_phrase_ignorable = current_phrase_have_ignorable || current_phrase_have_valid;
                                log_debug("current_phrase_ignorable = %d\n", current_phrase_ignorable);
                            } else {
                                log_debug("none of the above\n");
                            }

                            if (!current_phrase_ignorable && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !added_pre_phrase_space) {
                                log_debug("Adding space\n");
                                string_tree_add_string(tree, " ");
                                string_tree_finalize_token(tree);
                                last_added_was_whitespace = true;
                                added_pre_phrase_space = true;
                            }

                        }

                        if (current_phrase_ignorable) {
                            continue;
                        }

                        if (delete_phrases) {
                            current_phrase_expandable = !current_phrase_ignorable;
                        } else {
                            current_phrase_expandable = (expansion.address_components & options.address_components) || is_valid_for_components;
                        }

                        log_debug("current_phrase_expandable = %d\n", current_phrase_expandable);

                        log_debug("expansion.canonical_index = %d\n", expansion.canonical_index);

                        if (expansion.canonical_index != NULL_CANONICAL_INDEX && current_phrase_expandable) {
                            log_debug("expansion.canonical_index != NULL_CANONICAL_INDEX, delete_phrases = %d, phrase_option = %d\n", delete_phrases, phrase_option);
                            char *canonical = address_dictionary_get_canonical(expansion.canonical_index);
                            char *canonical_normalized = normalize_string_latin(canonical, strlen(canonical), normalize_string_options);

                            canonical = canonical_normalized != NULL ? canonical_normalized : canonical;

                            if (phrase.start + phrase.len < num_tokens - 1) {
                                token_t next_token = tokens[phrase.start + phrase.len];
                                if (!is_numeric_token(next_token.type)) {
                                    log_debug("non-canonical phrase, adding canonical string: %s\n", canonical);
                                    string_tree_add_string(tree, canonical);
                                    last_added_was_whitespace = false;
                                } else {
                                    log_debug("adding canonical with cstring_array methods: %s\n", canonical);
                                    uint32_t start_index = cstring_array_start_token(tree->strings);
                                    cstring_array_append_string(tree->strings, canonical);
                                    cstring_array_append_string(tree->strings, " ");
                                    last_added_was_whitespace = true;
                                    cstring_array_terminate(tree->strings);
                                }
                            } else {
                                log_debug("adding canonical: %s\n", canonical);
                                string_tree_add_string(tree, canonical);
                                last_added_was_whitespace = false;
                            }

                            if (canonical_normalized != NULL) {
                                free(canonical_normalized);
                            }
                        } else if (expansion.canonical_index == NULL_CANONICAL_INDEX || !current_phrase_expandable) {
                            log_debug("canonical phrase, adding canonical string\n");

                            uint32_t start_index = cstring_array_start_token(tree->strings);
                            for (size_t k = phrase.start; k < phrase.start + phrase.len; k++) {
                                token = tokens[k];
                                if (token.type != WHITESPACE) {
                                    cstring_array_append_string_len(tree->strings, str + token.offset, token.len);
                                    last_added_was_whitespace = false;
                                } else {
                                    log_debug("space\n");
                                    cstring_array_append_string(tree->strings, " ");
                                    last_added_was_whitespace = true;
                                }
                            }
                            cstring_array_terminate(tree->strings);
                        } else {
                            continue;
                        }

                        added_expansions++;
                    }

                }
            }

            log_debug("expansion_valid_components == %d\n", expansion_valid_components);

            if (added_expansions == 0 && (!delete_phrases || !expansion_valid_components)) {
                if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
                    log_debug("Adding space\n");
                    string_tree_add_string(tree, " ");
                    string_tree_finalize_token(tree);
                    last_added_was_whitespace = true;
                }

                uint32_t start_index = cstring_array_start_token(tree->strings);

                for (size_t j = phrase.start; j < phrase.start + phrase.len; j++) {
                    token = tokens[j];

                    if (token.type != WHITESPACE) {
                        log_debug("Adding canonical token, %.*s\n", (int)token.len, str + token.offset);
                        cstring_array_append_string_len(tree->strings, str + token.offset, token.len);
                        last_added_was_whitespace = false;
                    } else if (!last_added_was_whitespace) {
                        log_debug("Adding space\n");
                        cstring_array_append_string(tree->strings, " ");
                        last_added_was_whitespace = true;
                    }
                }

                cstring_array_terminate(tree->strings);
            }

            if (!delete_phrases || !expansion_valid_components || added_expansions > 0) {
                log_debug("i=%zu\n", i);
                bool end_of_phrase = false;
                if (i < phrases->n - 1) {
                    phrase_t next_phrase = phrases->a[i + 1].phrase;
                    end_of_phrase = (next_phrase.start != phrase.start || next_phrase.len != phrase.len);
                } else {
                    end_of_phrase = true;
                }

                log_debug("end_of_phrase=%d\n", end_of_phrase);
                if (end_of_phrase) {
                    log_debug("finalize at i=%zu\n", i);
                    string_tree_finalize_token(tree);
                }
            }

            start = phrase.start + phrase.len;
            prev_phrase = phrase;
        }

        char_array_destroy(key);

        end = num_tokens;

        if (phrase.start + phrase.len > 0 && phrase.start + phrase.len <= end - 1 && !last_added_was_whitespace) {
            token_t next_token = tokens[phrase.start + phrase.len];
            if (next_token.type != WHITESPACE && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0 && !is_ideographic(next_token.type)) {
                log_debug("space after phrase\n");
                string_tree_add_string(tree, " ");
                last_added_was_whitespace = true;
                string_tree_finalize_token(tree);
            }
        }


        for (size_t j = start; j < end; j++) {
            log_debug("On token %zu\n", j);
            token_t token = tokens[j];
            if (is_punctuation(token.type)) {
                log_debug("last_was_punctuation\n");
                last_was_punctuation = true;
                continue;
            }

            if (token.type != WHITESPACE) {
                if (j > 0 && last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
                    log_debug("Adding another space\n");
                    string_tree_add_string(tree, " ");
                    string_tree_finalize_token(tree);
                }
                log_debug("Adding previous token, %.*s\n", (int)token.len, str + token.offset);

                bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
                last_added_was_whitespace = false;
            } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
                log_debug("Adding space IV\n");
                string_tree_add_string(tree, " ");
                last_added_was_whitespace = true;
            } else {
                log_debug("Skipping token %zu\n", j);
                continue;
            }

            last_was_punctuation = false;
            string_tree_finalize_token(tree);
        }

    } else {
        log_debug("phrases NULL\n");
        for (size_t j = 0; j < num_tokens; j++) {
            log_debug("On token %zu\n", j);
            token_t token = tokens[j];
            if (is_punctuation(token.type)) {
                log_debug("punctuation, skipping\n");
                last_was_punctuation = true;
                continue;
            }

            if (token.type != WHITESPACE) {
                if (last_was_punctuation && !last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
                    log_debug("Adding space V\n");
                    string_tree_add_string(tree, " ");
                    string_tree_finalize_token(tree);
                }

                bool have_period_affixes = add_period_affixes_or_token(tree, str, token, options);
                last_added_was_whitespace = false;
            } else if (!last_added_was_whitespace && string_tree_num_tokens(tree) > 0) {
                log_debug("Adding space VI\n");
                string_tree_add_string(tree, " ");
                last_added_was_whitespace = true;
            } else {
                continue;
            }

            last_was_punctuation = false;
            string_tree_finalize_token(tree);
        }
    }

    if (phrases != NULL) {
        phrase_language_array_destroy(phrases);
    }

    token_array_destroy(token_array);

    return tree;
}
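
/* If the token ends in a valid ordinal suffix for lang (e.g. the "st" in
   "1st"), add both the original form and the form with the suffix removed. */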
inline bool normalize_ordinal_suffixes(string_tree_t *tree, char *str, char *lang, token_t token, size_t i, token_t prev_token, libpostal_normalize_options_t options) {
|
|
size_t len_ordinal_suffix = valid_ordinal_suffix_len(str, token, prev_token, lang);
|
|
|
|
if (len_ordinal_suffix > 0) {
|
|
cstring_array *strings = tree->strings;
|
|
// Add the original form first. When this function returns true,
|
|
// add_normalized_strings_token won't be called a second time.
|
|
add_normalized_strings_token(strings, str, token, options);
|
|
token_t normalized_token = token;
|
|
normalized_token.len = token.len - len_ordinal_suffix;
|
|
add_normalized_strings_token(strings, str, normalized_token, options);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
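
/*
Per-token normalization pass. Special tokens (e.g. URLs and email addresses)
are passed through verbatim. Otherwise each candidate language gets a chance
to expand dictionary affixes (expand_affixes) or strip an ordinal suffix;
if neither applies, the token is normalized on its own. Each token position
in the tree is finalized exactly once per loop iteration.
*/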
inline void add_normalized_strings_tokenized(string_tree_t *tree, char *str, token_array *tokens, libpostal_normalize_options_t options) {
    cstring_array *strings = tree->strings;

    token_t prev_token = (token_t){0, 0, 0};

    for (size_t i = 0; i < tokens->n; i++) {
        token_t token = tokens->a[i];
        bool have_phrase = false;
        bool have_ordinal = false;

        if (is_special_token(token.type)) {
            string_tree_add_string_len(tree, str + token.offset, token.len);
            string_tree_finalize_token(tree);
            continue;
        }

        for (size_t j = 0; j < options.num_languages; j++) {
            char *lang = options.languages[j];
            if (expand_affixes(tree, str, lang, token, options)) {
                have_phrase = true;
                break;
            }

            if (normalize_ordinal_suffixes(tree, str, lang, token, i, prev_token, options)) {
                have_ordinal = true;
                break;
            }
        }

        if (!have_phrase && !have_ordinal) {
            add_normalized_strings_token(strings, str, token, options);
        }

        string_tree_finalize_token(tree);
        prev_token = token;
    }
}
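
/*
Expand one normalized string into its alternatives. Two permutation spaces
are walked: the outer iterator enumerates token-level normalizations, and for
each resulting string the inner iterator enumerates the phrase-level
alternatives produced by add_string_alternatives_phrase_option. Results are
deduplicated through the unique_strings set, and EXCESSIVE_PERMUTATIONS caps
either level so combinatorial blowup stays bounded.
*/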
void expand_alternative_phrase_option(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
    size_t len = strlen(str);
    token_array *tokens = tokenize_keep_whitespace(str);
    string_tree_t *token_tree = string_tree_new_size(len);

    add_normalized_strings_tokenized(token_tree, str, tokens, options);

    string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree);

    string_tree_iterator_t *iter;

    char_array *temp_string = char_array_new_size(len);

    char *token;

    bool excessive_perms_outer = tokenized_iter->remaining >= EXCESSIVE_PERMUTATIONS;

    if (!excessive_perms_outer) {
        kh_resize(str_set, unique_strings, kh_size(unique_strings) + tokenized_iter->remaining);
    }

    log_debug("tokenized_iter->remaining=%d\n", tokenized_iter->remaining);
    for (; !string_tree_iterator_done(tokenized_iter); string_tree_iterator_next(tokenized_iter)) {
        char_array_clear(temp_string);

        string_tree_iterator_foreach_token(tokenized_iter, token, {
            if (token == NULL) {
                continue;
            }
            char_array_append(temp_string, token);
        })
        char_array_terminate(temp_string);

        char *tokenized_str = char_array_get_string(temp_string);

        string_tree_t *alternatives;

        int ret;
        log_debug("Adding alternatives for single normalization\n");
        alternatives = add_string_alternatives_phrase_option(tokenized_str, options, phrase_option);

        if (alternatives == NULL) {
            log_debug("alternatives = NULL\n");
            continue;
        }

        log_debug("num strings = %" PRIu32 "\n", string_tree_num_strings(alternatives));
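
        // Inner loop: walk every phrase-level alternative of this candidate.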
        iter = string_tree_iterator_new(alternatives);
        log_debug("iter->num_tokens=%d\n", iter->num_tokens);
        log_debug("iter->remaining=%d\n", iter->remaining);

        bool excessive_perms_inner = iter->remaining >= EXCESSIVE_PERMUTATIONS;
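
        // If either level would produce too many permutations, skip the full
        // walk below and keep only the tokenized string itself.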
        if (!excessive_perms_inner && !excessive_perms_outer) {
            for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
                char_array_clear(temp_string);
                string_tree_iterator_foreach_token(iter, token, {
                    if (token == NULL) {
                        log_debug("token=NULL\n");
                    } else {
                        log_debug("token=%s\n", token);
                        char_array_append(temp_string, token);
                    }
                })
                char_array_terminate(temp_string);

                token = char_array_get_string(temp_string);

                size_t token_len = strlen(token);

                if (token_len == 0) continue;

                size_t left_spaces = string_left_spaces_len(token, token_len);
                size_t right_spaces = string_right_spaces_len(token, token_len);

                if (left_spaces + right_spaces == token_len) {
                    continue;
                }

                char *dupe_token = strndup(token + left_spaces, token_len - left_spaces - right_spaces);

                log_debug("full string=%s\n", token);
                khiter_t k = kh_get(str_set, unique_strings, dupe_token);

                if (k == kh_end(unique_strings)) {
                    log_debug("doing postprocessing\n");
                    add_postprocessed_string(strings, dupe_token, options);
                    k = kh_put(str_set, unique_strings, dupe_token, &ret);
                } else {
                    free(dupe_token);
                }

                log_debug("iter->remaining = %d\n", iter->remaining);
            }
        } else {
            cstring_array_add_string(strings, tokenized_str);
        }

        string_tree_iterator_destroy(iter);
        string_tree_destroy(alternatives);

        if (excessive_perms_outer) {
            break;
        }
    }

    string_tree_iterator_destroy(tokenized_iter);
    string_tree_destroy(token_tree);

    token_array_destroy(tokens);

    char_array_destroy(temp_string);
}
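

/*
Run the expansion once per language so that dictionary phrases from one
language do not combine with phrases from another within a single expansion.
A one-element language array is reused across iterations; with no languages
specified, the expansion runs once with the options untouched.
*/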
void expand_alternative_phrase_option_languages(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, libpostal_normalize_options_t options, expansion_phrase_option_t phrase_option) {
    char **temp_languages = calloc(1, sizeof(char *));
    libpostal_normalize_options_t temp_options = options;

    for (size_t i = 0; i < options.num_languages; i++) {
        char *lang = options.languages[i];

        temp_languages[0] = lang;
        temp_options.languages = temp_languages;
        temp_options.num_languages = 1;
        expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option);
    }

    if (options.num_languages == 0) {
        temp_options.languages = options.languages;
        temp_options.num_languages = options.num_languages;
        expand_alternative_phrase_option(strings, unique_strings, str, temp_options, phrase_option);
    }

    free(temp_languages);
}
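
/*
Top-level expansion pipeline: classify the input language when none was
supplied, build a string-level normalization tree (transliteration, case
folding, etc. according to the options), then expand every normalization
permutation and collect the unique results. On return, *n holds the number
of expansion strings in the returned array.
*/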
cstring_array *expand_address_phrase_option(char *input, libpostal_normalize_options_t options, size_t *n, expansion_phrase_option_t phrase_option) {
    options.address_components |= LIBPOSTAL_ADDRESS_ANY;

    uint64_t normalize_string_options = get_normalize_string_options(options);

    size_t len = strlen(input);

    libpostal_language_classifier_response_t *lang_response = NULL;

    if (options.num_languages == 0) {
        lang_response = classify_languages(input);
        if (lang_response != NULL) {
            options.num_languages = lang_response->num_languages;
            options.languages = lang_response->languages;
        }
    }
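
    // Build the string-level normalization tree; each path through it is one
    // complete normalization of the raw input.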
    string_tree_t *tree = normalize_string_languages(input, normalize_string_options, options.num_languages, options.languages);

    cstring_array *strings = cstring_array_new_size(len * 2);
    char_array *temp_string = char_array_new_size(len);

    khash_t(str_set) *unique_strings = kh_init(str_set);

    char *token;

    log_debug("string_tree_num_tokens(tree) = %d\n", string_tree_num_tokens(tree));
    if (string_tree_num_strings(tree) == 1) {
        char *normalized = string_tree_get_alternative(tree, 0, 0);
        expand_alternative_phrase_option_languages(strings, unique_strings, normalized, options, phrase_option);
    } else {
        log_debug("Adding alternatives for multiple normalizations\n");
        string_tree_iterator_t *iter = string_tree_iterator_new(tree);

        for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
            char *segment;
            char_array_clear(temp_string);
            bool is_first = true;

            string_tree_iterator_foreach_token(iter, segment, {
                if (!is_first) {
                    char_array_append(temp_string, " ");
                }
                char_array_append(temp_string, segment);
                is_first = false;
            })
            char_array_terminate(temp_string);
            token = char_array_get_string(temp_string);
            log_debug("current permutation = %s\n", token);
            expand_alternative_phrase_option_languages(strings, unique_strings, token, options, phrase_option);
        }

        string_tree_iterator_destroy(iter);
    }
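
    // unique_strings owns its keys (strndup'd during expansion); free them
    // before destroying the set.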
    char *key_str = NULL;
    for (size_t i = kh_begin(unique_strings); i != kh_end(unique_strings); ++i) {
        if (!kh_exist(unique_strings, i)) continue;
        key_str = (char *)kh_key(unique_strings, i);
        free(key_str);
    }

    kh_destroy(str_set, unique_strings);

    if (lang_response != NULL) {
        libpostal_language_classifier_response_destroy(lang_response);
    }

    char_array_destroy(temp_string);
    string_tree_destroy(tree);

    *n = cstring_array_num_strings(strings);

    return strings;
}
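
/*
Public entry points. expand_address returns the full set of expansions, with
phrases expanded to every known form. A minimal usage sketch, assuming the
libpostal data files have been loaded through the public setup calls and that
libpostal_get_default_options() from the public API is acceptable:

    size_t n;
    cstring_array *expansions = expand_address("120 E 96th St", libpostal_get_default_options(), &n);
    for (size_t i = 0; i < n; i++) {
        printf("%s\n", cstring_array_get_string(expansions, i));
    }
    cstring_array_destroy(expansions);
*/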
cstring_array *expand_address(char *input, libpostal_normalize_options_t options, size_t *n) {
    return expand_address_phrase_option(input, options, n, EXPAND_PHRASES);
}
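
/*
Root expansion: DELETE_PHRASES strips ignorable dictionary phrases down to a
root form, intended mainly for deduping and matching rather than display.
*/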
cstring_array *expand_address_root(char *input, libpostal_normalize_options_t options, size_t *n) {
    return expand_address_phrase_option(input, options, n, DELETE_PHRASES);
}
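

/*
Free a flat array of expansion strings, e.g. one handed back through the
public API.
*/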
void expansion_array_destroy(char **expansions, size_t n) {
    for (size_t i = 0; i < n; i++) {
        free(expansions[i]);
    }
    free(expansions);
}