[dedupe] for near-dupe hashing, remove whitespace from root expansions so something like "Ocean Walk Dr" and "Oceanwalk Dr" will have a chance of matching downstream

This commit is contained in:
Al
2018-02-24 00:33:04 -05:00
parent 283be99b44
commit 835de327c3
3 changed files with 63 additions and 3 deletions

View File

@@ -114,7 +114,36 @@
#define PO_BOX_CONTAINING_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX #define PO_BOX_CONTAINING_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
#define PO_BOX_POSTCODE_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX #define PO_BOX_POSTCODE_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX
cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, size_t *n) {
/**
 * Append a copy of str to strings as a new token with all whitespace
 * codepoints removed, so that e.g. "Ocean Walk Dr" is stored as "OceanWalkDr".
 *
 * Whitespace is whatever string_next_whitespace() reports; each whitespace
 * codepoint is skipped by its full UTF-8 byte length.
 *
 * @param strings  destination array; the token is written to strings->str
 * @param str      NUL-terminated UTF-8 input (not modified)
 * @return false (and strings untouched) if either argument is NULL,
 *         true otherwise
 */
bool cstring_array_add_string_no_whitespace(cstring_array *strings, char *str) {
    if (strings == NULL || str == NULL) return false;

    size_t len = strlen(str);
    size_t start = 0;
    uint8_t *ptr = (uint8_t *)str;
    int32_t ch;
    ssize_t token_len;

    cstring_array_start_token(strings);

    /* A result of 0 means a whitespace codepoint sits exactly at the current
     * offset (leading whitespace, or several whitespace characters in a row).
     * That case must keep scanning: stopping there would copy the rest of the
     * string with its whitespace still in place. Only a negative result
     * (no more whitespace) ends the loop. */
    while ((token_len = string_next_whitespace(str + start)) >= 0) {
        /* Copy the run of non-whitespace bytes that precedes the whitespace. */
        char_array_append_len(strings->str, str + start, (size_t)token_len);
        start += (size_t)token_len;

        /* Decode the whitespace codepoint only to learn how many bytes to skip. */
        ssize_t char_len = utf8proc_iterate(ptr + start, (ssize_t)(len - start), &ch);
        if (char_len <= 0) break;  /* never step backwards on a decode error */
        start += (size_t)char_len;
    }

    /* Whatever follows the last whitespace codepoint. */
    char_array_append_len(strings->str, str + start, len - start);
    char_array_terminate(strings->str);
    return true;
}
cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, bool remove_spaces, size_t *n) {
size_t num_expansions = 0; size_t num_expansions = 0;
cstring_array *expansions = expand_address(input, options, &num_expansions); cstring_array *expansions = expand_address(input, options, &num_expansions);
@@ -155,7 +184,11 @@ cstring_array *expanded_component_combined(char *input, libpostal_normalize_opti
k = kh_get(str_set, unique_strings, expansion); k = kh_get(str_set, unique_strings, expansion);
if (k == kh_end(unique_strings)) { if (k == kh_end(unique_strings)) {
cstring_array_add_string(all_expansions, expansion); if (remove_spaces) {
cstring_array_add_string_no_whitespace(all_expansions, expansion);
} else {
cstring_array_add_string(all_expansions, expansion);
}
k = kh_put(str_set, unique_strings, expansion, &ret); k = kh_put(str_set, unique_strings, expansion, &ret);
if (ret < 0) { if (ret < 0) {
break; break;
@@ -664,13 +697,15 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,
} }
} }
bool remove_spaces = false;
cstring_array *street_expansions = NULL; cstring_array *street_expansions = NULL;
size_t num_street_expansions = 0; size_t num_street_expansions = 0;
if (place->street != NULL) { if (place->street != NULL) {
remove_spaces = true;
log_debug("Doing street expansions for %s\n", place->street); log_debug("Doing street expansions for %s\n", place->street);
normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY; normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY;
street_expansions = expanded_component_combined(place->street, normalize_options, &num_street_expansions); street_expansions = expanded_component_combined(place->street, normalize_options, remove_spaces, &num_street_expansions);
log_debug("Got %zu street expansions\n", num_street_expansions); log_debug("Got %zu street expansions\n", num_street_expansions);
} }

View File

@@ -789,6 +789,28 @@ inline bool string_contains_period(char *str) {
return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0; return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0;
} }
/**
 * Find the first whitespace codepoint in a UTF-8 buffer.
 *
 * @param str  UTF-8 bytes to scan
 * @param len  number of bytes of str that may be read
 * @return byte offset of the first codepoint for which utf8_is_whitespace()
 *         is true, or -1 if the scan reaches the end of the buffer, a NUL
 *         codepoint, or a sequence utf8proc_iterate() cannot decode first
 */
ssize_t string_next_whitespace_len(char *str, size_t len) {
    uint8_t *ptr = (uint8_t *)str;
    int32_t ch;
    size_t idx = 0;

    while (idx < len) {
        /* Pass only the bytes not yet consumed: handing the decoder the full
         * len after ptr has advanced would let it read past the caller's
         * bound on a multi-byte sequence near the end of the buffer. */
        ssize_t char_len = utf8proc_iterate(ptr, (ssize_t)(len - idx), &ch);

        if (char_len <= 0 || ch == 0) break;
        if (utf8_is_whitespace(ch)) return (ssize_t)idx;

        ptr += char_len;
        idx += (size_t)char_len;
    }

    return -1;
}
/**
 * NUL-terminated convenience form of string_next_whitespace_len(): scans the
 * whole of str and returns whatever that function returns for it.
 */
ssize_t string_next_whitespace(char *str) {
    size_t num_bytes = strlen(str);
    return string_next_whitespace_len(str, num_bytes);
}
size_t string_right_spaces_len(char *str, size_t len) { size_t string_right_spaces_len(char *str, size_t len) {
size_t spaces = 0; size_t spaces = 0;

View File

@@ -122,6 +122,9 @@ ssize_t string_next_period(char *str);
bool string_contains_period_len(char *str, size_t len); bool string_contains_period_len(char *str, size_t len);
bool string_contains_period(char *str); bool string_contains_period(char *str);
/* Byte offset of the first whitespace codepoint within the first len bytes of
 * the UTF-8 string str, or -1 if none is found. */
ssize_t string_next_whitespace_len(char *str, size_t len);
/* Same as string_next_whitespace_len() with len taken from strlen(str). */
ssize_t string_next_whitespace(char *str);
size_t string_left_spaces_len(char *str, size_t len); size_t string_left_spaces_len(char *str, size_t len);
size_t string_right_spaces_len(char *str, size_t len); size_t string_right_spaces_len(char *str, size_t len);
char *string_trim(char *str); char *string_trim(char *str);