Files
libpostal/src/near_dupe.c

1106 lines
52 KiB
C

#include <stdarg.h>
#include "log/log.h"
#include "near_dupe.h"
#include "acronyms.h"
#include "double_metaphone.h"
#include "expand.h"
#include "features.h"
#include "float_utils.h"
#include "place.h"
#include "scanner.h"
#include "string_utils.h"
#include "tokens.h"
#include "unicode_scripts.h"
#include "unicode_script_types.h"
#include "geohash/geohash.h"
/* Upper bound applied to the caller-supplied geohash precision in geohash_and_neighbors(). */
#define MAX_GEOHASH_PRECISION 12
/*
 * Single-component key prefixes. Each is a short string literal; the composite
 * prefixes below are formed by adjacent string-literal concatenation, e.g.
 * NAME_ADDRESS_UNIT_GEOHASH_KEY_PREFIX expands to "n" "a" "u" "gh" == "naugh".
 * The prefix is the first field of every emitted hash key, followed by
 * '|'-separated component values (see add_hashes_from_tree()).
 */
#define NAME_KEY_PREFIX "n"
#define ADDRESS_KEY_PREFIX "a"
#define UNIT_KEY_PREFIX "u"
#define PO_BOX_KEY_PREFIX "p"
#define HOUSE_NUMBER_KEY_PREFIX "h"
#define STREET_KEY_PREFIX "s"
/* Geographic qualifier prefixes (last part of every composite prefix). */
#define GEOHASH_KEY_PREFIX "gh"
#define POSTCODE_KEY_PREFIX "pc"
#define CITY_KEY_PREFIX "ct"
#define CONTAINING_BOUNDARY_PREFIX "cb"
/* name + address + unit + <geo qualifier> */
#define NAME_ADDRESS_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_ADDRESS_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_ADDRESS_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_ADDRESS_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + address + <geo qualifier> */
#define NAME_ADDRESS_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_ADDRESS_CITY_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_ADDRESS_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_ADDRESS_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX ADDRESS_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + house number + unit + <geo qualifier> */
#define NAME_HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + house number + <geo qualifier> */
#define NAME_HOUSE_NUMBER_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_HOUSE_NUMBER_CITY_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_HOUSE_NUMBER_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_HOUSE_NUMBER_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + street + unit + <geo qualifier> */
#define NAME_STREET_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_STREET_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_STREET_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_STREET_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + street + <geo qualifier> */
#define NAME_STREET_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_STREET_CITY_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_STREET_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_STREET_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX STREET_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + PO box + <geo qualifier> */
#define NAME_PO_BOX_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_PO_BOX_CITY_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_PO_BOX_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_PO_BOX_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + unit + <geo qualifier> */
#define NAME_UNIT_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_UNIT_CITY_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_UNIT_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_UNIT_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX
/* name + <geo qualifier> */
#define NAME_GEOHASH_KEY_PREFIX NAME_KEY_PREFIX GEOHASH_KEY_PREFIX
#define NAME_CITY_KEY_PREFIX NAME_KEY_PREFIX CITY_KEY_PREFIX
#define NAME_CONTAINING_KEY_PREFIX NAME_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define NAME_POSTCODE_KEY_PREFIX NAME_KEY_PREFIX POSTCODE_KEY_PREFIX
/* address + unit + <geo qualifier> (no name) */
#define ADDRESS_UNIT_GEOHASH_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX
#define ADDRESS_UNIT_CITY_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX
#define ADDRESS_UNIT_CONTAINING_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define ADDRESS_UNIT_POSTCODE_KEY_PREFIX ADDRESS_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX
/* address + <geo qualifier> */
#define ADDRESS_GEOHASH_KEY_PREFIX ADDRESS_KEY_PREFIX GEOHASH_KEY_PREFIX
#define ADDRESS_CITY_KEY_PREFIX ADDRESS_KEY_PREFIX CITY_KEY_PREFIX
#define ADDRESS_CONTAINING_KEY_PREFIX ADDRESS_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define ADDRESS_POSTCODE_KEY_PREFIX ADDRESS_KEY_PREFIX POSTCODE_KEY_PREFIX
/* house number + unit + <geo qualifier> */
#define HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX
#define HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX
#define HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX
/* house number + <geo qualifier> */
#define HOUSE_NUMBER_GEOHASH_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX GEOHASH_KEY_PREFIX
#define HOUSE_NUMBER_CITY_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CITY_KEY_PREFIX
#define HOUSE_NUMBER_CONTAINING_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define HOUSE_NUMBER_POSTCODE_KEY_PREFIX HOUSE_NUMBER_KEY_PREFIX POSTCODE_KEY_PREFIX
/* street + <geo qualifier> */
#define STREET_GEOHASH_KEY_PREFIX STREET_KEY_PREFIX GEOHASH_KEY_PREFIX
#define STREET_CITY_KEY_PREFIX STREET_KEY_PREFIX CITY_KEY_PREFIX
#define STREET_CONTAINING_KEY_PREFIX STREET_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define STREET_POSTCODE_KEY_PREFIX STREET_KEY_PREFIX POSTCODE_KEY_PREFIX
/* street + unit + <geo qualifier> */
#define STREET_UNIT_GEOHASH_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX GEOHASH_KEY_PREFIX
#define STREET_UNIT_CITY_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CITY_KEY_PREFIX
#define STREET_UNIT_CONTAINING_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define STREET_UNIT_POSTCODE_KEY_PREFIX STREET_KEY_PREFIX UNIT_KEY_PREFIX POSTCODE_KEY_PREFIX
/* PO box + <geo qualifier> */
#define PO_BOX_GEOHASH_KEY_PREFIX PO_BOX_KEY_PREFIX GEOHASH_KEY_PREFIX
#define PO_BOX_CITY_KEY_PREFIX PO_BOX_KEY_PREFIX CITY_KEY_PREFIX
#define PO_BOX_CONTAINING_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define PO_BOX_POSTCODE_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX
cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, size_t *n) {
size_t num_expansions = 0;
cstring_array *expansions = expand_address(input, options, &num_expansions);
size_t num_root_expansions = 0;
cstring_array *root_expansions = expand_address_root(input, options, &num_root_expansions);
if (num_root_expansions == 0) {
cstring_array_destroy(root_expansions);
*n = num_expansions;
return expansions;
} else if (num_expansions == 0) {
cstring_array_destroy(expansions);
*n = num_root_expansions;
return root_expansions;
} else {
khash_t(str_set) *unique_strings = kh_init(str_set);
char *expansion;
khiter_t k;
int ret;
cstring_array *all_expansions = cstring_array_new();
for (size_t i = 0; i < num_expansions; i++) {
expansion = cstring_array_get_string(expansions, i);
k = kh_get(str_set, unique_strings, expansion);
if (k == kh_end(unique_strings)) {
cstring_array_add_string(all_expansions, expansion);
k = kh_put(str_set, unique_strings, expansion, &ret);
if (ret < 0) {
break;
}
}
}
for (size_t i = 0; i < num_root_expansions; i++) {
expansion = cstring_array_get_string(root_expansions, i);
k = kh_get(str_set, unique_strings, expansion);
if (k == kh_end(unique_strings)) {
cstring_array_add_string(all_expansions, expansion);
k = kh_put(str_set, unique_strings, expansion, &ret);
if (ret < 0) {
break;
}
}
}
*n = cstring_array_num_strings(all_expansions);
kh_destroy(str_set, unique_strings);
cstring_array_destroy(root_expansions);
cstring_array_destroy(expansions);
return all_expansions;
}
}
/*
 * Return the root expansions of `input`; if there are none, discard that
 * result and return the regular expand_address() expansions instead.
 * *n receives the number of strings in whichever array is returned.
 */
static inline cstring_array *expanded_component_root_with_fallback(char *input, libpostal_normalize_options_t options, size_t *n) {
    cstring_array *root_only = expand_address_root(input, options, n);

    if (*n == 0) {
        // No root expansions: fall back to the full expansions.
        cstring_array_destroy(root_only);
        *n = 0;
        return expand_address(input, options, n);
    }

    return root_only;
}
/*
 * Encode (latitude, longitude) as a geohash of `geohash_precision` characters
 * (clamped to MAX_GEOHASH_PRECISION) and return a new array holding that
 * geohash followed by its 8 neighbors.
 *
 * Returns NULL if the precision is 0, if encoding fails, or if the neighbor
 * lookup does not yield exactly 8 cells.
 */
static cstring_array *geohash_and_neighbors(double latitude, double longitude, size_t geohash_precision) {
    if (geohash_precision == 0) {
        return NULL;
    }

    size_t precision = geohash_precision > MAX_GEOHASH_PRECISION ? MAX_GEOHASH_PRECISION : geohash_precision;

    // One extra byte per cell for the NUL terminator.
    size_t cell_len = precision + 1;

    char center[cell_len];
    if (geohash_encode(latitude, longitude, center, cell_len) != GEOHASH_OK) {
        return NULL;
    }

    // The neighbors are written back-to-back, one cell_len-sized slot each.
    size_t ring_size = cell_len * 8;
    char ring[ring_size];
    int num_neighbors = 0;

    if (geohash_neighbors(center, ring, ring_size, &num_neighbors) != GEOHASH_OK || num_neighbors != 8) {
        return NULL;
    }

    cstring_array *cells = cstring_array_new_size(9 * cell_len);
    cstring_array_add_string(cells, center);

    char *slot = ring;
    for (int k = 0; k < num_neighbors; k++) {
        cstring_array_add_string(cells, slot);
        slot += cell_len;
    }

    return cells;
}
/*
 * Append `str` to `strings` unless it is already recorded in `unique_strings`.
 *
 * The set stores its own heap copy of the string (strdup), so whoever
 * destroys the set must free the keys.
 *
 * Returns true only when the string was newly recorded and appended.
 * Returns false when `str` is NULL, when it was already present, or when the
 * key could not be allocated/inserted; in those cases nothing is appended and
 * no memory is leaked.
 */
static inline bool add_string_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
    if (str == NULL) {
        return false;
    }

    khiter_t k = kh_get(str_set, unique_strings, str);
    if (k != kh_end(unique_strings)) {
        // Already seen.
        return false;
    }

    // Copy first: the caller's buffer is typically reused for the next token.
    char *key = strdup(str);
    if (key == NULL) {
        return false;
    }

    int ret = 0;
    kh_put(str_set, unique_strings, key, &ret);
    if (ret < 0) {
        // Insertion failed, so the set did not take ownership of the copy.
        free(key);
        return false;
    }

    cstring_array_add_string(strings, str);
    return true;
}
/*
 * Compute the double metaphone codes of `str` and add them to `strings`
 * (de-duplicated through `unique_strings`).
 *
 * The primary code is added when it is non-empty; the secondary code is added
 * only in that case and only when it differs from the primary.
 *
 * Returns false if `str` is NULL or the codes could not be computed, true
 * otherwise (regardless of whether anything new was added).
 */
static inline bool add_double_metaphone_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
    if (str == NULL) {
        return false;
    }

    double_metaphone_codes_t *codes = double_metaphone(str);
    if (codes == NULL) {
        return false;
    }

    bool has_primary = !string_equals(codes->primary, "");
    if (has_primary) {
        add_string_to_array_if_unique(codes->primary, strings, unique_strings);

        bool secondary_differs = !string_equals(codes->secondary, codes->primary);
        if (secondary_differs) {
            add_string_to_array_if_unique(codes->secondary, strings, unique_strings);
        }
    }

    double_metaphone_codes_destroy(codes);
    return true;
}
/*
 * Add a phonetic or literal key for `str`:
 *   - if the whole string is Latin script, add its double metaphone codes;
 *   - otherwise add the string itself.
 * Returns false for a NULL string; otherwise returns whatever the chosen
 * helper returns.
 */
static inline bool add_double_metaphone_or_token_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
    if (str == NULL) {
        return false;
    }

    size_t str_len = strlen(str);
    string_script_t detected = get_string_script(str, str_len);

    // Anything that is not Latin from start to end is kept verbatim.
    if (detected.len != str_len || detected.script != SCRIPT_LATIN) {
        return add_string_to_array_if_unique(str, strings, unique_strings);
    }

    return add_double_metaphone_to_array_if_unique(str, strings, unique_strings);
}
/* NOTE(review): presumably a cap on the number of name tokens; no use of it is visible in the reviewed part of this file - confirm it is still needed. */
#define MAX_NAME_TOKENS 50
/*
 * Build the list of unique "word hash" strings for a name.
 *
 * Two kinds of strings are produced:
 *   1. Per expansion of `name` (root expansions, falling back to regular
 *      expansions): for each token, the double metaphone codes of Latin-script
 *      word tokens, the token itself for other scripts, two-character shingles
 *      for consecutive ideographic tokens, plus the concatenation of all
 *      tokens of the expansion.
 *   2. From the normalized name: acronyms built from the first letter of each
 *      word token, with and without stopwords, as well as sub-acronyms that
 *      are flushed at stopword/punctuation boundaries.
 *
 * normalize_options is taken by value; its address_components field is
 * overridden locally, and its languages are used for stopword detection.
 *
 * Returns a new cstring_array (not freed here), or NULL when `name` has no
 * expansions.
 */
cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options) {
    normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY;

    size_t num_expansions = 0;
    cstring_array *name_expansions = expanded_component_root_with_fallback(name, normalize_options, &num_expansions);
    if (num_expansions == 0) {
        cstring_array_destroy(name_expansions);
        return NULL;
    }

    size_t len = strlen(name);

    char_array *token_string_array = char_array_new_size(len);
    cstring_array *strings = cstring_array_new_size(len);
    token_array *token_array = token_array_new();
    uint32_array *stopwords_array = uint32_array_new();

    char_array *combined_words_no_whitespace = char_array_new();
    char_array *acronym_with_stopwords = char_array_new();
    char_array *acronym_no_stopwords = char_array_new();
    char_array *sub_acronym_with_stopwords = char_array_new();
    char_array *sub_acronym_no_stopwords = char_array_new();

    // Keys are heap copies made by add_string_to_array_if_unique; freed at the end.
    khash_t(str_set) *unique_strings = kh_init(str_set);

    bool keep_whitespace = false;

    /* Part 1: per-token hashes for every expansion of the name. */
    for (size_t i = 0; i < num_expansions; i++) {
        char *expansion = cstring_array_get_string(name_expansions, i);
        log_debug("expansion = %s\n", expansion);
        token_array_clear(token_array);
        tokenize_add_tokens(token_array, expansion, strlen(expansion), keep_whitespace);
        size_t num_tokens = token_array->n;
        token_t *tokens = token_array->a;
        token_t prev_token = NULL_TOKEN;
        char *token_str;
        char_array_clear(combined_words_no_whitespace);

        for (size_t j = 0; j < num_tokens; j++) {
            token_t token = tokens[j];
            bool ideogram = is_ideographic(token.type);

            string_script_t token_script = get_string_script(expansion + token.offset, token.len);
            bool is_latin = token_script.len == token.len && token_script.script == SCRIPT_LATIN;

            char_array_clear(token_string_array);
            // For ideograms, since the "words" are characters, we use shingles of two characters
            if (ideogram && j > 0 && is_ideographic(prev_token.type)) {
                log_debug("cat ideogram\n");
                char_array_cat_len(token_string_array, expansion + prev_token.offset, prev_token.len);
            }

            char_array_cat_len(combined_words_no_whitespace, expansion + token.offset, token.len);

            // For Latin script, add double metaphone of the words
            if (is_latin && !is_numeric_token(token.type) && !ideogram && !is_punctuation(token.type)) {
                char_array_clear(token_string_array);
                char_array_cat_len(token_string_array, expansion + token.offset, token.len);
                token_str = char_array_get_string(token_string_array);
                log_debug("token_str = %s\n", token_str);
                add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings);
            // For non-Latin words (Arabic, Cyrillic, etc.) just add the word
            // For ideograms, we do two-character shingles, so only add the first character if the string has one token
            } else if (!ideogram || j > 0 || num_tokens == 1) {
                char_array_cat_len(token_string_array, expansion + token.offset, token.len);
                token_str = char_array_get_string(token_string_array);
                log_debug("token_str = %s\n", token_str);
                add_string_to_array_if_unique(token_str, strings, unique_strings);
            }

            prev_token = token;
        }

        // All tokens of this expansion glued together without whitespace.
        if (combined_words_no_whitespace->n > 0) {
            char *combined = char_array_get_string(combined_words_no_whitespace);
            add_string_to_array_if_unique(combined, strings, unique_strings);
        }
    }

    token_array_clear(token_array);

    /* Part 2: acronyms derived from the normalized (not expanded) name. */
    char *normalized = libpostal_normalize_string(name, LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS);
    char *acronym = NULL;
    if (normalized != NULL) {
        tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace);
        stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages);
        uint32_t *stopwords = stopwords_array->a;

        size_t num_tokens = token_array->n;
        token_t *tokens = token_array->a;

        // A single token has no acronym.
        if (num_tokens > 1) {
            size_t num_stopwords_encountered = 0;
            bool last_was_stopword = false;
            bool last_was_punctuation = false;

            for (size_t j = 0; j < num_tokens; j++) {
                token_t token = tokens[j];
                // Make sure it's a non-ideographic word token
                if (is_word_token(token.type) && !is_ideographic(token.type)) {
                    uint8_t *ptr = (uint8_t *)normalized;
                    int32_t ch = 0;
                    // ch_len is the byte length of the token's first code point.
                    ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch);
                    if (ch_len > 0 && utf8_is_letter(utf8proc_category(ch))) {
                        bool is_stopword = stopwords[j] == 1;

                        if (!is_stopword && !last_was_punctuation) {
                            // Regular word: contributes its initial to every acronym.
                            char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
                            char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
                            if (!(last_was_stopword && j == num_tokens - 1)) {
                                char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
                                char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len);
                            }
                            last_was_stopword = false;
                        } else {
                            // Stopword, or a word directly following punctuation.
                            if (!last_was_stopword && is_stopword) {
                                num_stopwords_encountered++;
                            }

                            char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
                            if (!is_stopword) {
                                char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
                            }

                            if ((num_stopwords_encountered % 2 == 0 || last_was_punctuation) && acronym_no_stopwords->n > 1) {
                                // Flush the sub-acronyms collected so far. Each buffer is
                                // cleared only after the string taken from it has been consumed.
                                acronym = char_array_get_string(sub_acronym_with_stopwords);
                                log_debug("sub acronym stopwords = %s\n", acronym);
                                add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
                                char_array_clear(sub_acronym_with_stopwords);

                                acronym = char_array_get_string(sub_acronym_no_stopwords);
                                log_debug("sub acronym no stopwords = %s\n", acronym);
                                add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
                                char_array_clear(sub_acronym_no_stopwords);
                            } else if (!((last_was_stopword || last_was_punctuation) && j == num_tokens - 1)) {
                                char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
                            }

                            last_was_stopword = is_stopword;
                        }
                        last_was_punctuation = false;
                    }
                } else if (is_punctuation(token.type)) {
                    log_debug("punctuation\n");
                    last_was_punctuation = true;
                }
            }
        }
        free(normalized);
    }

    // Emit whatever is left in each acronym buffer. Every guard checks the
    // same buffer that is read, so an empty buffer is never emitted.
    if (acronym_with_stopwords->n > 0) {
        acronym = char_array_get_string(acronym_with_stopwords);
        log_debug("acronym with stopwords = %s\n", acronym);
        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
    }

    if (acronym_no_stopwords->n > 0) {
        acronym = char_array_get_string(acronym_no_stopwords);
        log_debug("acronym no stopwords = %s\n", acronym);
        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
    }

    if (sub_acronym_with_stopwords->n > 0) {
        acronym = char_array_get_string(sub_acronym_with_stopwords);
        log_debug("final sub acronym stopwords = %s\n", acronym);
        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
    }

    if (sub_acronym_no_stopwords->n > 0) {
        acronym = char_array_get_string(sub_acronym_no_stopwords);
        log_debug("final sub acronym no stopwords = %s\n", acronym);
        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
    }

    char_array_destroy(token_string_array);
    token_array_destroy(token_array);
    char_array_destroy(combined_words_no_whitespace);
    char_array_destroy(acronym_with_stopwords);
    char_array_destroy(acronym_no_stopwords);
    char_array_destroy(sub_acronym_with_stopwords);
    char_array_destroy(sub_acronym_no_stopwords);
    uint32_array_destroy(stopwords_array);
    cstring_array_destroy(name_expansions);

    // The set owns its (strdup'ed) keys; release them before the table itself.
    const char *key;
    kh_foreach_key(unique_strings, key, {
        free((char *)key);
    });
    kh_destroy(str_set, unique_strings);

    return strings;
}
/*
 * Read `n` cstring_array pointers from `args` and add each non-empty array to
 * `tree` as one token whose alternatives are the array's strings. Empty
 * arrays are skipped and contribute no token.
 *
 * This function does not call va_end: the va_list belongs to the caller that
 * invoked va_start, and va_end has to be called in that same function.
 */
static inline void add_string_arrays_to_tree(string_tree_t *tree, size_t n, va_list args) {
    for (size_t i = 0; i < n; i++) {
        cstring_array *a = va_arg(args, cstring_array *);
        size_t num_strings = cstring_array_num_strings(a);
        if (num_strings == 0) {
            continue;
        }

        for (size_t j = 0; j < num_strings; j++) {
            char *str = cstring_array_get_string(a, j);
            string_tree_add_string(tree, str);
        }
        // Close the current token so the next array starts a new one.
        string_tree_finalize_token(tree);
    }
}
/*
 * Walk every combination the string tree iterator yields and append one hash
 * key per combination to `near_dupe_hashes`. Each key has the form
 *     prefix|alt_1|alt_2|...|alt_k
 * Nothing is appended when the iterator reports zero tokens.
 */
static inline void add_hashes_from_tree(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree) {
    string_tree_iterator_t *iter = string_tree_iterator_new(tree);

    if (iter->num_tokens > 0) {
        log_debug("iter->num_tokens = %u\n", iter->num_tokens);

        while (!string_tree_iterator_done(iter)) {
            // Start a new key with the prefix, then add each chosen alternative.
            cstring_array_start_token(near_dupe_hashes);
            cstring_array_append_string(near_dupe_hashes, prefix);

            char *alternative;
            string_tree_iterator_foreach_token(iter, alternative, {
                cstring_array_append_string(near_dupe_hashes, "|");
                cstring_array_append_string(near_dupe_hashes, alternative);
            });

            cstring_array_terminate(near_dupe_hashes);

            string_tree_iterator_next(iter);
        }
    }

    string_tree_iterator_destroy(iter);
}
/*
 * Emit hash keys for every combination of the given component arrays.
 *
 * The variadic arguments are `n` values of type cstring_array *. `tree` is
 * scratch space: it is cleared, filled from the arrays, and then expanded
 * into `prefix`-led keys appended to `near_dupe_hashes`.
 */
static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree, size_t n, ...) {
    // Reset the shared tree before reusing it for this key family.
    string_tree_clear(tree);
    log_debug("prefix=%s\n", prefix);

    va_list component_arrays;
    va_start(component_arrays, n);
    add_string_arrays_to_tree(tree, n, component_arrays);
    va_end(component_arrays);

    log_debug("string_tree_num_strings(tree)=%u\n", string_tree_num_strings(tree));

    add_hashes_from_tree(near_dupe_hashes, prefix, tree);
}
cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) {
if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_postal_code) return NULL;
place_t *place = place_from_components(num_components, labels, values);
log_debug("created place\n");
if (place == NULL) return NULL;
bool have_valid_geo = options.with_latlon;
if (!have_valid_geo && options.with_postal_code && place->postal_code != NULL) {
have_valid_geo = true;
}
if (!have_valid_geo && options.with_city_or_equivalent && (place->city != NULL || place->city_district != NULL || place->suburb != NULL || place->island != NULL)) {
have_valid_geo = true;
}
if (!have_valid_geo && options.with_small_containing_boundaries && (place->state_district != NULL)) {
have_valid_geo = true;
}
if (!have_valid_geo) {
log_debug("no valid geo\n");
place_destroy(place);
return NULL;
}
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
language_classifier_response_t *lang_response = NULL;
if (num_languages == 0) {
lang_response = place_languages(num_components, labels, values);
if (lang_response != NULL) {
log_debug("got %zu place languages\n", lang_response->num_languages);
normalize_options.num_languages = lang_response->num_languages;
normalize_options.languages = lang_response->languages;
}
} else {
normalize_options.num_languages = num_languages;
normalize_options.languages = languages;
}
string_tree_t *tree = string_tree_new();
cstring_array *name_expansions = NULL;
size_t num_name_expansions = 0;
if (place->name != NULL && options.with_name) {
log_debug("Doing name expansions for %s\n", place->name);
name_expansions = name_word_hashes(place->name, normalize_options);
if (name_expansions != NULL) {
num_name_expansions = cstring_array_num_strings(name_expansions);
log_debug("Got %zu name expansions\n", num_name_expansions);
}
}
cstring_array *street_expansions = NULL;
size_t num_street_expansions = 0;
if (place->street != NULL) {
log_debug("Doing street expansions for %s\n", place->street);
normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY;
street_expansions = expanded_component_combined(place->street, normalize_options, &num_street_expansions);
log_debug("Got %zu street expansions\n", num_street_expansions);
}
cstring_array *house_number_expansions = NULL;
size_t num_house_number_expansions = 0;
if (place->house_number != NULL) {
log_debug("Doing house number expansions for %s\n", place->house_number);
normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER | LIBPOSTAL_ADDRESS_ANY;
house_number_expansions = expand_address_root(place->house_number, normalize_options, &num_house_number_expansions);
log_debug("Got %zu house number expansions\n", num_house_number_expansions);
}
cstring_array *unit_expansions = NULL;
size_t num_unit_expansions = 0;
if (place->unit != NULL && options.with_unit) {
log_debug("Doing unit expansions for %s\n", place->unit);
normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY;
unit_expansions = expand_address_root(place->unit, normalize_options, &num_unit_expansions);
log_debug("Got %zu unit expansions\n", num_unit_expansions);
}
cstring_array *building_expansions = NULL;
size_t num_building_expansions = 0;
if (place->building != NULL && options.with_unit) {
normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT | LIBPOSTAL_ADDRESS_ANY;
building_expansions = expand_address_root(place->building, normalize_options, &num_building_expansions);
}
cstring_array *level_expansions = NULL;
size_t num_level_expansions = 0;
if (place->level != NULL && options.with_unit) {
normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL | LIBPOSTAL_ADDRESS_ANY;
level_expansions = expand_address_root(place->level, normalize_options, &num_level_expansions);
}
cstring_array *po_box_expansions = NULL;
size_t num_po_box_expansions = 0;
if (place->po_box != NULL) {
normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX | LIBPOSTAL_ADDRESS_ANY;
po_box_expansions = expand_address_root(place->po_box, normalize_options, &num_po_box_expansions);
}
cstring_array *place_expansions = NULL;
cstring_array *containing_expansions = NULL;
if (options.with_city_or_equivalent) {
normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM | LIBPOSTAL_ADDRESS_ANY;
if (place->city != NULL) {
size_t num_city_expansions = 0;
cstring_array *city_expansions = expand_address_root(place->city, normalize_options, &num_city_expansions);
if (place_expansions == NULL) {
place_expansions = city_expansions;
} else if (city_expansions != NULL && num_city_expansions > 0) {
cstring_array_extend(place_expansions, city_expansions);
cstring_array_destroy(city_expansions);
}
}
if (place->city_district != NULL) {
size_t num_city_district_expansions = 0;
cstring_array *city_district_expansions = expand_address_root(place->city_district, normalize_options, &num_city_district_expansions);
if (place_expansions == NULL) {
place_expansions = city_district_expansions;
} else if (city_district_expansions != NULL && num_city_district_expansions > 0) {
cstring_array_extend(place_expansions, city_district_expansions);
cstring_array_destroy(city_district_expansions);
}
}
if (place->suburb != NULL) {
size_t num_suburb_expansions = 0;
cstring_array *suburb_expansions = expand_address_root(place->suburb, normalize_options, &num_suburb_expansions);
if (place_expansions == NULL) {
place_expansions = suburb_expansions;
} else if (suburb_expansions != NULL && num_suburb_expansions > 0) {
cstring_array_extend(place_expansions, suburb_expansions);
cstring_array_destroy(suburb_expansions);
}
}
if (place->island != NULL) {
size_t num_island_expansions = 0;
cstring_array *island_expansions = expand_address_root(place->island, normalize_options, &num_island_expansions);
if (place_expansions == NULL) {
place_expansions = island_expansions;
} else if (island_expansions != NULL && num_island_expansions > 0) {
cstring_array_extend(place_expansions, island_expansions);
cstring_array_destroy(island_expansions);
}
}
if (place->state_district != NULL && options.with_small_containing_boundaries) {
size_t num_state_district_expansions = 0;
cstring_array *state_district_expansions = expand_address_root(place->state_district, normalize_options, &num_state_district_expansions);
if (containing_expansions == NULL) {
containing_expansions = state_district_expansions;
} else if (state_district_expansions != NULL && num_state_district_expansions > 0) {
cstring_array_extend(containing_expansions, state_district_expansions);
cstring_array_destroy(state_district_expansions);
}
}
}
cstring_array *postal_code_expansions = NULL;
size_t num_postal_code_expansions = 0;
if (options.with_postal_code && place->postal_code != NULL) {
normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE | LIBPOSTAL_ADDRESS_ANY;
postal_code_expansions = expand_address_root(place->postal_code, normalize_options, &num_postal_code_expansions);
}
cstring_array *geohash_expansions = NULL;
if (options.with_latlon && !(double_equals(options.latitude, 0.0) && double_equals(options.longitude, 0.0))) {
geohash_expansions = geohash_and_neighbors(options.latitude, options.longitude, options.geohash_precision);
}
size_t num_geohash_expansions = geohash_expansions != NULL ? cstring_array_num_strings(geohash_expansions) : 0;
if (num_geohash_expansions == 0 && num_postal_code_expansions == 0 && place_expansions == NULL && containing_expansions == NULL) {
return NULL;
}
num_name_expansions = name_expansions != NULL ? cstring_array_num_strings(name_expansions) : 0;
num_street_expansions = street_expansions != NULL ? cstring_array_num_strings(street_expansions) : 0;
num_house_number_expansions = house_number_expansions != NULL ? cstring_array_num_strings(house_number_expansions) : 0;
num_po_box_expansions = po_box_expansions != NULL ? cstring_array_num_strings(po_box_expansions) : 0;
num_unit_expansions = unit_expansions != NULL ? cstring_array_num_strings(unit_expansions) : 0;
num_building_expansions = building_expansions != NULL ? cstring_array_num_strings(building_expansions) : 0;
num_level_expansions = level_expansions != NULL ? cstring_array_num_strings(level_expansions) : 0;
bool have_unit = num_unit_expansions > 0 || num_building_expansions > 0 || num_level_expansions > 0;
cstring_array *unit_or_equivalent_expansions = NULL;
if (num_unit_expansions > 0) {
unit_or_equivalent_expansions = unit_expansions;
} else if (num_building_expansions > 0) {
unit_or_equivalent_expansions = building_expansions;
} else if (num_level_expansions > 0) {
unit_or_equivalent_expansions = level_expansions;
}
cstring_array *near_dupe_hashes = cstring_array_new();
if (num_name_expansions > 0) {
if (num_street_expansions > 0 && num_house_number_expansions > 0 && options.name_and_address_keys) {
// Have street, house number, and unit
if (have_unit) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_GEOHASH_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_CITY_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_CONTAINING_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_UNIT_POSTCODE_KEY_PREFIX, tree, 5, name_expansions, street_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions);
}
// Have street and house number, no unit
} else {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_CITY_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_ADDRESS_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, street_expansions, house_number_expansions, postal_code_expansions);
}
}
// Japan, other places with no street names
} else if (num_house_number_expansions > 0 && options.name_and_address_keys) {
// House number and unit
if (have_unit) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions);
}
// House number, no unit
} else {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_CITY_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_HOUSE_NUMBER_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, house_number_expansions, postal_code_expansions);
}
}
// Addresses in India, UK, Ireland, many university addresses, etc. may have house name + street with no house numbers
} else if (num_street_expansions > 0 && options.name_and_address_keys) {
// Have street and unit, no house number
if (have_unit) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_GEOHASH_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_CITY_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_CONTAINING_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_UNIT_POSTCODE_KEY_PREFIX, tree, 4, name_expansions, street_expansions, unit_or_equivalent_expansions, postal_code_expansions);
}
// Have street only, no house number or unit
} else {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, street_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_CITY_KEY_PREFIX, tree, 3, name_expansions, street_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, street_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_STREET_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, street_expansions, postal_code_expansions);
}
}
// PO Box only addresses, mailing addresses
} else if (num_po_box_expansions > 0 && options.name_and_address_keys) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_CITY_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_PO_BOX_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, po_box_expansions, postal_code_expansions);
}
// Only name
} else if (options.name_only_keys) {
// Have name and unit, some university addresses
if (have_unit) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_GEOHASH_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_CITY_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_CONTAINING_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_UNIT_POSTCODE_KEY_PREFIX, tree, 3, name_expansions, unit_or_equivalent_expansions, postal_code_expansions);
}
// Have name and geo only
} else {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_GEOHASH_KEY_PREFIX, tree, 2, name_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_CITY_KEY_PREFIX, tree, 2, name_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_CONTAINING_KEY_PREFIX, tree, 2, name_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, NAME_POSTCODE_KEY_PREFIX, tree, 2, name_expansions, postal_code_expansions);
}
}
}
}
if (options.address_only_keys) {
if (num_street_expansions > 0 && num_house_number_expansions > 0) {
// Have street, house number, and unit
if (have_unit) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_GEOHASH_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_CITY_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_CONTAINING_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_UNIT_POSTCODE_KEY_PREFIX, tree, 4, street_expansions, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions);
}
// Have street and house number, no unit
} else {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_GEOHASH_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_CITY_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_CONTAINING_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, ADDRESS_POSTCODE_KEY_PREFIX, tree, 3, street_expansions, house_number_expansions, postal_code_expansions);
}
}
// Japan, other places with no street names
} else if (num_house_number_expansions > 0) {
// House number and unit
if (have_unit) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_GEOHASH_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_CITY_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_CONTAINING_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_UNIT_POSTCODE_KEY_PREFIX, tree, 3, house_number_expansions, unit_or_equivalent_expansions, postal_code_expansions);
}
// House number, no unit
} else {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_GEOHASH_KEY_PREFIX, tree, 2, house_number_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_CITY_KEY_PREFIX, tree, 2, house_number_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_CONTAINING_KEY_PREFIX, tree, 2, house_number_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, HOUSE_NUMBER_POSTCODE_KEY_PREFIX, tree, 2, house_number_expansions, postal_code_expansions);
}
}
// Addresses in India, UK, Ireland, many university addresses, etc. may have house name + street with no house numbers
} else if (num_street_expansions > 0) {
// Have street and unit, no house number
if (have_unit) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_GEOHASH_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_CITY_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_CONTAINING_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_UNIT_POSTCODE_KEY_PREFIX, tree, 3, street_expansions, unit_or_equivalent_expansions, postal_code_expansions);
}
// Have street only, no house number or unit
} else {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_GEOHASH_KEY_PREFIX, tree, 2, street_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_CITY_KEY_PREFIX, tree, 2, street_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_CONTAINING_KEY_PREFIX, tree, 2, street_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, STREET_POSTCODE_KEY_PREFIX, tree, 2, street_expansions, postal_code_expansions);
}
}
// PO Box only addresses, mailing addresses
} else if (num_po_box_expansions > 0) {
if (geohash_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, PO_BOX_GEOHASH_KEY_PREFIX, tree, 2, po_box_expansions, geohash_expansions);
}
if (place_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, PO_BOX_CITY_KEY_PREFIX, tree, 2, po_box_expansions, place_expansions);
}
if (containing_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, PO_BOX_CONTAINING_KEY_PREFIX, tree, 2, po_box_expansions, containing_expansions);
}
if (postal_code_expansions != NULL) {
add_string_hash_permutations(near_dupe_hashes, PO_BOX_POSTCODE_KEY_PREFIX, tree, 2, po_box_expansions, postal_code_expansions);
}
}
}
if (place != NULL) {
place_destroy(place);
}
if (tree != NULL) {
string_tree_destroy(tree);
}
if (name_expansions != NULL) {
cstring_array_destroy(name_expansions);
}
if (street_expansions != NULL) {
cstring_array_destroy(street_expansions);
}
if (house_number_expansions != NULL) {
cstring_array_destroy(house_number_expansions);
}
if (unit_expansions != NULL) {
cstring_array_destroy(unit_expansions);
}
if (building_expansions != NULL) {
cstring_array_destroy(building_expansions);
}
if (level_expansions != NULL) {
cstring_array_destroy(level_expansions);
}
if (po_box_expansions != NULL) {
cstring_array_destroy(po_box_expansions);
}
if (place_expansions != NULL) {
cstring_array_destroy(place_expansions);
}
if (containing_expansions != NULL) {
cstring_array_destroy(containing_expansions);
}
if (postal_code_expansions != NULL) {
cstring_array_destroy(postal_code_expansions);
}
if (geohash_expansions != NULL) {
cstring_array_destroy(geohash_expansions);
}
if (lang_response != NULL) {
language_classifier_response_destroy(lang_response);
}
return near_dupe_hashes;
}
// Convenience wrapper around near_dupe_hashes_languages().
//
// Forwards num_components, labels, values and options unchanged, and passes
// 0 and NULL for the two trailing arguments (presumably the language count
// and language list -- confirm against near_dupe_hashes_languages).
//
// Returns exactly what near_dupe_hashes_languages() returns.
// NOTE(review): the caller presumably owns the returned array -- confirm.
inline cstring_array *near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options) {
    cstring_array *hashes = near_dupe_hashes_languages(num_components, labels, values, options, 0, NULL);
    return hashes;
}