From c48c2b778c0b6fccabdf42989a324b4af5819126 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 30 Dec 2017 02:28:38 -0500 Subject: [PATCH] [dedupe] fixes to near dupe hashing, geohash lengths, cutting off name hashing at 50 unique tokens, fixing memory leaks, checking for valid geo components and returning NULL if one of the required fields isn't present --- src/near_dupe.c | 52 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/src/near_dupe.c b/src/near_dupe.c index b649d9d8..f2c08280 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -185,23 +185,24 @@ static cstring_array *geohash_and_neighbors(double latitude, double longitude, s if (geohash_precision == 0) return NULL; if (geohash_precision > MAX_GEOHASH_PRECISION) geohash_precision = MAX_GEOHASH_PRECISION; + size_t geohash_len = geohash_precision + 1; - char geohash[geohash_precision + 1]; - if (geohash_encode(latitude, longitude, geohash, geohash_precision) != GEOHASH_OK) { + char geohash[geohash_len]; + if (geohash_encode(latitude, longitude, geohash, geohash_len) != GEOHASH_OK) { return NULL; } - size_t neighbors_size = geohash_precision * 8 + 1; + size_t neighbors_size = geohash_len * 8; char neighbors[neighbors_size]; int num_strings = 0; if (geohash_neighbors(geohash, neighbors, neighbors_size, &num_strings) == GEOHASH_OK && num_strings == 8) { - cstring_array *strings = cstring_array_new_size(9 * geohash_precision + 1); + cstring_array *strings = cstring_array_new_size(9 * geohash_len); cstring_array_add_string(strings, geohash); for (int i = 0; i < num_strings; i++) { - char *neighbor = neighbors + geohash_precision * i; + char *neighbor = neighbors + geohash_len * i; cstring_array_add_string(strings, neighbor); } return strings; @@ -210,6 +211,8 @@ static cstring_array *geohash_and_neighbors(double latitude, double longitude, s return NULL; } +#define MAX_NAME_TOKENS 50 + cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normalize_options) { normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME | LIBPOSTAL_ADDRESS_ANY; @@ -276,7 +279,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal k = kh_get(str_set, unique_strings, dm_primary); - if (k == kh_end(unique_strings)) { + if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { log_debug("adding dm_primary = %s\n", dm_primary); cstring_array_add_string(strings, dm_primary); k = kh_put(str_set, unique_strings, strdup(dm_primary), &ret); @@ -289,7 +292,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal k = kh_get(str_set, unique_strings, dm_secondary); - if (k == kh_end(unique_strings)) { + if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { log_debug("adding dm_secondary = %s\n", dm_secondary); cstring_array_add_string(strings, dm_secondary); k = kh_put(str_set, unique_strings, strdup(dm_secondary), &ret); @@ -327,6 +330,8 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal token_array_destroy(token_array); char_array_destroy(combined_words_no_whitespace); + cstring_array_destroy(name_expansions); + const char *key; kh_foreach_key(unique_strings, key, { @@ -394,11 +399,32 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes, cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages) { + if (!options.with_latlon && !options.with_city_or_equivalent && !options.with_postal_code) return NULL; + place_t *place = place_from_components(num_components, labels, values); log_debug("created place\n"); if (place == NULL) return NULL; - size_t n = 0; + bool have_valid_geo = options.with_latlon; + + if (!have_valid_geo && options.with_postal_code && place->postal_code != NULL) { + have_valid_geo = true; + } + + if (!have_valid_geo && options.with_city_or_equivalent && (place->city != NULL || place->city_district != NULL || place->suburb != NULL || place->island != NULL)) { + have_valid_geo = true; + } + + if (!have_valid_geo && options.with_small_containing_boundaries && (place->state_district != NULL)) { + have_valid_geo = true; + } + + + if (!have_valid_geo) { + log_debug("no valid geo\n"); + place_destroy(place); + return NULL; + } libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); @@ -413,7 +439,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, normalize_options.languages = lang_response->languages; } } else { - normalize_options.num_languages = languages; + normalize_options.num_languages = num_languages; normalize_options.languages = languages; } @@ -531,7 +557,7 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, } } - if (place->state_district != NULL) { + if (place->state_district != NULL && options.with_small_containing_boundaries) { size_t num_state_district_expansions = 0; cstring_array *state_district_expansions = expand_address_root(place->state_district, normalize_options, &num_state_district_expansions); if (containing_expansions == NULL) { @@ -560,8 +586,6 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, return NULL; } - bool added = false; - num_name_expansions = name_expansions != NULL ? cstring_array_num_strings(name_expansions) : 0; num_street_expansions = street_expansions != NULL ? cstring_array_num_strings(street_expansions) : 0; num_house_number_expansions = house_number_expansions != NULL ? cstring_array_num_strings(house_number_expansions) : 0; @@ -881,6 +905,10 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, } + if (place != NULL) { + place_destroy(place); + } + if (tree != NULL) { string_tree_destroy(tree); }