From 835de327c3145516df50638c4c6ec365dea434e8 Mon Sep 17 00:00:00 2001
From: Al
Date: Sat, 24 Feb 2018 00:33:04 -0500
Subject: [PATCH] [dedupe] for near-dupe hashing, remove whitespace from root
 expansions so something like "Ocean Walk Dr" and "Oceanwalk Dr" will have a
 chance of matching downstream

---
Revision notes (this section is not part of the commit message):
 * cstring_array_add_string_no_whitespace() now loops while the scan result
   is >= 0. With "> 0" the loop ended at leading or consecutive whitespace
   and the rest of the string was copied with its whitespace intact.
 * string_next_whitespace_len() passes the remaining byte count (len - idx)
   to utf8proc_iterate() instead of the full len, so the decoder is never
   told it may read past the end of the buffer.
 * Line breaks and indentation were restored from a whitespace-collapsed copy
   of this patch. Indentation and the position of blank context lines are a
   best guess, and the index lines still carry the original blob ids.

 src/near_dupe.c    | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 src/string_utils.c | 31 +++++++++++++++++++++++++++++++
 src/string_utils.h |  4 ++++
 3 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/src/near_dupe.c b/src/near_dupe.c
index 657950a1..0f9b954b 100644
--- a/src/near_dupe.c
+++ b/src/near_dupe.c
@@ -114,7 +114,46 @@
 #define PO_BOX_CONTAINING_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
 #define PO_BOX_POSTCODE_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX
 
-cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, size_t *n) {
+
+/*
+ * Start a new token in `strings` and append a copy of `str` with every
+ * whitespace codepoint removed, e.g. "Ocean Walk Dr" is stored as "OceanWalkDr".
+ *
+ * Returns false if either argument is NULL, true otherwise.
+ */
+bool cstring_array_add_string_no_whitespace(cstring_array *strings, char *str) {
+    if (strings == NULL || str == NULL) return false;
+
+    size_t len = strlen(str);
+    size_t start = 0;
+
+    cstring_array_start_token(strings);
+
+    uint8_t *ptr = (uint8_t *)str;
+    ssize_t token_len;
+
+    // token_len == 0 means the remaining text starts with whitespace (leading or
+    // consecutive whitespace). Keep looping so it is skipped instead of copied.
+    while ((token_len = string_next_whitespace_len(str + start, len - start)) >= 0) {
+        char_array_append_len(strings->str, str + start, (size_t)token_len);
+        start += (size_t)token_len;
+
+        // Step over the whitespace codepoint itself, using its encoded byte length
+        int32_t ch;
+        ssize_t char_len = utf8proc_iterate(ptr + start, len - start, &ch);
+        if (char_len <= 0) break;
+        start += (size_t)char_len;
+    }
+
+    // Copy whatever follows the last whitespace codepoint
+    char_array_append_len(strings->str, str + start, len - start);
+    char_array_terminate(strings->str);
+
+    return true;
+}
+
+
+cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, bool remove_spaces, size_t *n) {
     size_t num_expansions = 0;
     cstring_array *expansions = expand_address(input, options, &num_expansions);
 
@@ -155,7 +194,11 @@ cstring_array *expanded_component_combined(char *input, libpostal_normalize_opti
             k = kh_get(str_set, unique_strings, expansion);
 
             if (k == kh_end(unique_strings)) {
-                cstring_array_add_string(all_expansions, expansion);
+                if (remove_spaces) {
+                    cstring_array_add_string_no_whitespace(all_expansions, expansion);
+                } else {
+                    cstring_array_add_string(all_expansions, expansion);
+                }
                 k = kh_put(str_set, unique_strings, expansion, &ret);
                 if (ret < 0) {
                     break;
@@ -664,13 +707,15 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels, 
         }
     }
 
+    bool remove_spaces = false;
     cstring_array *street_expansions = NULL;
     size_t num_street_expansions = 0;
 
     if (place->street != NULL) {
+        remove_spaces = true;
         log_debug("Doing street expansions for %s\n", place->street);
         normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY;
-        street_expansions = expanded_component_combined(place->street, normalize_options, &num_street_expansions);
+        street_expansions = expanded_component_combined(place->street, normalize_options, remove_spaces, &num_street_expansions);
         log_debug("Got %zu street expansions\n", num_street_expansions);
     }
 
diff --git a/src/string_utils.c b/src/string_utils.c
index 9febcf92..1500fa10 100644
--- a/src/string_utils.c
+++ b/src/string_utils.c
@@ -789,6 +789,37 @@ inline bool string_contains_period(char *str) {
     return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0;
 }
 
+/*
+ * Find the first whitespace codepoint within the first `len` bytes of the
+ * UTF-8 string `str`.
+ *
+ * Returns the byte offset of that codepoint, or -1 if a NUL codepoint, a
+ * decoding error or the end of the buffer is reached first.
+ */
+ssize_t string_next_whitespace_len(char *str, size_t len) {
+    uint8_t *ptr = (uint8_t *)str;
+    int32_t ch;
+    size_t idx = 0;
+
+    while (idx < len) {
+        // Only the bytes remaining after idx may be read by the decoder
+        ssize_t char_len = utf8proc_iterate(ptr, len - idx, &ch);
+
+        if (char_len <= 0 || ch == 0) break;
+
+        if (utf8_is_whitespace(ch)) return (ssize_t)idx;
+        ptr += char_len;
+        idx += (size_t)char_len;
+    }
+    return -1;
+}
+
+/* Same as string_next_whitespace_len, with the length taken from strlen(str). */
+ssize_t string_next_whitespace(char *str) {
+    return string_next_whitespace_len(str, strlen(str));
+}
+
+
 size_t string_right_spaces_len(char *str, size_t len) {
     size_t spaces = 0;
 
diff --git a/src/string_utils.h b/src/string_utils.h
index 915f9e39..db1aef22 100644
--- a/src/string_utils.h
+++ b/src/string_utils.h
@@ -122,6 +122,10 @@ ssize_t string_next_period(char *str);
 bool string_contains_period_len(char *str, size_t len);
 bool string_contains_period(char *str);
 
+// Byte offset of the first whitespace codepoint in str, or -1 if none is found
+ssize_t string_next_whitespace_len(char *str, size_t len);
+ssize_t string_next_whitespace(char *str);
+
 size_t string_left_spaces_len(char *str, size_t len);
 size_t string_right_spaces_len(char *str, size_t len);
 char *string_trim(char *str);