[dedupe] for near-dupe hashing, remove whitespace from root expansions so something like "Ocean Walk Dr" and "Oceanwalk Dr" will have a chance of matching downstream
This commit is contained in:
@@ -114,7 +114,36 @@
|
||||
#define PO_BOX_CONTAINING_KEY_PREFIX PO_BOX_KEY_PREFIX CONTAINING_BOUNDARY_PREFIX
|
||||
#define PO_BOX_POSTCODE_KEY_PREFIX PO_BOX_KEY_PREFIX POSTCODE_KEY_PREFIX
|
||||
|
||||
cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, size_t *n) {
|
||||
|
||||
/*
 * Append str to strings as a new token, copying every byte of str except
 * the whitespace characters (as classified by string_next_whitespace, which
 * decodes UTF-8 codepoints).
 *
 * strings: destination array; a new token is started on it before copying.
 * str:     NUL-terminated input string.
 *
 * Returns false without modifying anything if either argument is NULL,
 * true otherwise.
 */
bool cstring_array_add_string_no_whitespace(cstring_array *strings, char *str) {
    if (strings == NULL || str == NULL) return false;

    size_t len = strlen(str);
    size_t start = 0;
    uint8_t *ptr = (uint8_t *)str;

    cstring_array_start_token(strings);

    while (start < len) {
        /* Offset of the next whitespace character relative to start,
         * or negative when no whitespace remains. */
        ssize_t token_len = string_next_whitespace(str + start);
        if (token_len < 0) break;

        /* A zero offset means whitespace sits right at start (leading or
         * consecutive whitespace): nothing to copy, but it still has to be
         * skipped rather than ending the scan. */
        if (token_len > 0) {
            char_array_append_len(strings->str, str + start, (size_t)token_len);
            start += (size_t)token_len;
        }

        /* Decode the whitespace character to learn its byte width, then
         * step over it. */
        int32_t ch;
        ssize_t char_len = utf8proc_iterate(ptr + start, (ssize_t)(len - start), &ch);
        /* Never add a non-positive length to the unsigned offset. */
        if (char_len <= 0) break;
        start += (size_t)char_len;
    }

    /* Copy whatever follows the last whitespace character (possibly nothing). */
    char_array_append_len(strings->str, str + start, len - start);
    char_array_terminate(strings->str);

    return true;
}
|
||||
|
||||
|
||||
cstring_array *expanded_component_combined(char *input, libpostal_normalize_options_t options, bool remove_spaces, size_t *n) {
|
||||
size_t num_expansions = 0;
|
||||
cstring_array *expansions = expand_address(input, options, &num_expansions);
|
||||
|
||||
@@ -155,7 +184,11 @@ cstring_array *expanded_component_combined(char *input, libpostal_normalize_opti
|
||||
k = kh_get(str_set, unique_strings, expansion);
|
||||
|
||||
if (k == kh_end(unique_strings)) {
|
||||
if (remove_spaces) {
|
||||
cstring_array_add_string_no_whitespace(all_expansions, expansion);
|
||||
} else {
|
||||
cstring_array_add_string(all_expansions, expansion);
|
||||
}
|
||||
k = kh_put(str_set, unique_strings, expansion, &ret);
|
||||
if (ret < 0) {
|
||||
break;
|
||||
@@ -664,13 +697,15 @@ cstring_array *near_dupe_hashes_languages(size_t num_components, char **labels,
|
||||
}
|
||||
}
|
||||
|
||||
bool remove_spaces = false;
|
||||
|
||||
cstring_array *street_expansions = NULL;
|
||||
size_t num_street_expansions = 0;
|
||||
if (place->street != NULL) {
|
||||
remove_spaces = true;
|
||||
log_debug("Doing street expansions for %s\n", place->street);
|
||||
normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY;
|
||||
street_expansions = expanded_component_combined(place->street, normalize_options, &num_street_expansions);
|
||||
street_expansions = expanded_component_combined(place->street, normalize_options, remove_spaces, &num_street_expansions);
|
||||
log_debug("Got %zu street expansions\n", num_street_expansions);
|
||||
}
|
||||
|
||||
|
||||
@@ -789,6 +789,28 @@ inline bool string_contains_period(char *str) {
|
||||
return string_next_codepoint(str, string_next_codepoint(str, PERIOD_CODEPOINT)) >= 0;
|
||||
}
|
||||
|
||||
/*
 * Find the first whitespace character in the first len bytes of str.
 *
 * str: UTF-8 input; scanning stops early at a NUL codepoint or at a byte
 *      sequence utf8proc_iterate cannot decode.
 * len: number of bytes of str that may be examined.
 *
 * Returns the byte offset of the first codepoint for which
 * utf8_is_whitespace is true, or -1 if none is found (including when str
 * is NULL).
 */
ssize_t string_next_whitespace_len(char *str, size_t len) {
    if (str == NULL) return -1;

    uint8_t *ptr = (uint8_t *)str;
    size_t idx = 0;

    while (idx < len) {
        int32_t ch;
        /* Pass only the bytes that remain so the decoder cannot look past
         * the caller-supplied bound on a truncated multi-byte sequence. */
        ssize_t char_len = utf8proc_iterate(ptr + idx, (ssize_t)(len - idx), &ch);

        if (char_len <= 0 || ch == 0) break;

        if (utf8_is_whitespace(ch)) return (ssize_t)idx;

        idx += (size_t)char_len;
    }

    return -1;
}
|
||||
|
||||
/*
 * Convenience wrapper around string_next_whitespace_len for NUL-terminated
 * strings: measures str with strlen and scans that many bytes.
 * Returns whatever string_next_whitespace_len returns.
 */
ssize_t string_next_whitespace(char *str) {
    size_t num_bytes = strlen(str);
    return string_next_whitespace_len(str, num_bytes);
}
|
||||
|
||||
|
||||
size_t string_right_spaces_len(char *str, size_t len) {
|
||||
size_t spaces = 0;
|
||||
|
||||
|
||||
@@ -122,6 +122,9 @@ ssize_t string_next_period(char *str);
|
||||
bool string_contains_period_len(char *str, size_t len);
|
||||
bool string_contains_period(char *str);
|
||||
|
||||
ssize_t string_next_whitespace_len(char *str, size_t len);
|
||||
ssize_t string_next_whitespace(char *str);
|
||||
|
||||
size_t string_left_spaces_len(char *str, size_t len);
|
||||
size_t string_right_spaces_len(char *str, size_t len);
|
||||
char *string_trim(char *str);
|
||||
|
||||
Reference in New Issue
Block a user