From 098babfdee01b2e738a290910b79a7fa2acdf818 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 29 Dec 2017 04:48:00 -0500
Subject: [PATCH] [dedupe] adding the core pairwise deduping module which ties
 together most of the work on this branch. Includes simple phrase-aware exact
 deduping methods, with per-component variations as to whether e.g. a root
 expansion match counts as an exact duplicate or not (in a secondary unit,
 "No. 2" and "Apt 2" can be considered an exact match in English whereas we
 wouldn't want to make that kind of assumption for street e.g. "Park Ave" and
 "Park Pl"). The API is fairly low-level at present, and may require a few
 calls. Notably, we leave the TFIDF scores or other weighting schemes to the
 client. Since each component gets its own dupe classification, it leaves the
 door open for doing more specific checks around e.g. compound house
 numbers/ranges in the future.

---
 src/dedupe.c | 467 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/dedupe.h |  26 +++
 2 files changed, 493 insertions(+)
 create mode 100644 src/dedupe.c
 create mode 100644 src/dedupe.h

diff --git a/src/dedupe.c b/src/dedupe.c
new file mode 100644
index 00000000..bbf613a0
--- /dev/null
+++ b/src/dedupe.c
@@ -0,0 +1,467 @@
+#include "acronyms.h"
+#include "address_parser.h"
+#include "dedupe.h"
+#include "expand.h"
+#include "float_utils.h"
+#include "jaccard.h"
+#include "place.h"
+#include "scanner.h"
+#include "soft_tfidf.h"
+#include "token_types.h"
+
+/*
+ * Returns true if at least one string occurs in both expansion arrays
+ * (compared with string_equals). A NULL array is treated as empty.
+ */
+bool expansions_intersect(cstring_array *expansions1, cstring_array *expansions2) {
+    if (expansions1 == NULL || expansions2 == NULL) return false;
+
+    size_t n1 = cstring_array_num_strings(expansions1);
+    size_t n2 = cstring_array_num_strings(expansions2);
+
+    for (size_t i = 0; i < n1; i++) {
+        char *e1 = cstring_array_get_string(expansions1, i);
+        for (size_t j = 0; j < n2; j++) {
+            char *e2 = cstring_array_get_string(expansions2, j);
+            if (string_equals(e1, e2)) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+/*
+ * Expands both strings with expand_address (root == false) or
+ * expand_address_root (root == true) and reports whether the two sets of
+ * expansions share at least one string. Returns false if either input is
+ * NULL or either expansion call returns NULL.
+ */
+bool address_component_equals_root_option(char *s1, char *s2, libpostal_normalize_options_t options, bool root) {
+    if (s1 == NULL || s2 == NULL) return false;
+
+    size_t n1 = 0;
+    size_t n2 = 0;
+
+    cstring_array *expansions1 = root ? expand_address_root(s1, options, &n1)
+                                      : expand_address(s1, options, &n1);
+    if (expansions1 == NULL) return false;
+
+    cstring_array *expansions2 = root ? expand_address_root(s2, options, &n2)
+                                      : expand_address(s2, options, &n2);
+    if (expansions2 == NULL) {
+        cstring_array_destroy(expansions1);
+        return false;
+    }
+
+    bool intersect = expansions_intersect(expansions1, expansions2);
+
+    cstring_array_destroy(expansions1);
+    cstring_array_destroy(expansions2);
+
+    return intersect;
+}
+
+/* True if the full (non-root) expansions of s1 and s2 intersect. */
+static inline bool address_component_equals(char *s1, char *s2, libpostal_normalize_options_t options) {
+    return address_component_equals_root_option(s1, s2, options, false);
+}
+
+/* True if the root expansions of s1 and s2 intersect. */
+static inline bool address_component_equals_root(char *s1, char *s2, libpostal_normalize_options_t options) {
+    return address_component_equals_root_option(s1, s2, options, true);
+}
+
+/*
+ * True if either the root or the full expansions of s1 and s2 intersect.
+ * The root argument is accepted for signature parity but is not consulted.
+ */
+static inline bool address_component_equals_root_fallback(char *s1, char *s2, libpostal_normalize_options_t options, bool root) {
+    (void)root;
+    return address_component_equals_root(s1, s2, options) || address_component_equals(s1, s2, options);
+}
+
+/*
+ * Core exact-dupe check shared by the per-component wrappers below.
+ *
+ * The languages from options are copied into normalize_options and
+ * LIBPOSTAL_ADDRESS_ANY is OR-ed into its address components. The root and
+ * full expansion comparisons are then tried in the order selected by
+ * root_comparison_first: a full-expansion match yields
+ * LIBPOSTAL_EXACT_DUPLICATE, a root-expansion match yields
+ * root_comparison_status, and no match yields LIBPOSTAL_NON_DUPLICATE.
+ * Returns LIBPOSTAL_NULL_DUPLICATE_STATUS if either value is NULL.
+ */
+libpostal_duplicate_status_t is_duplicate(char *value1, char *value2, libpostal_normalize_options_t normalize_options, libpostal_duplicate_options_t options, bool root_comparison_first, libpostal_duplicate_status_t root_comparison_status) {
+    if (value1 == NULL || value2 == NULL) {
+        return LIBPOSTAL_NULL_DUPLICATE_STATUS;
+    }
+
+    normalize_options.num_languages = options.num_languages;
+    normalize_options.languages = options.languages;
+
+    normalize_options.address_components |= LIBPOSTAL_ADDRESS_ANY;
+
+    if (root_comparison_first) {
+        if (address_component_equals_root(value1, value2, normalize_options)) {
+            return root_comparison_status;
+        } else if (address_component_equals(value1, value2, normalize_options)) {
+            return LIBPOSTAL_EXACT_DUPLICATE;
+        }
+    } else {
+        if (address_component_equals(value1, value2, normalize_options)) {
+            return LIBPOSTAL_EXACT_DUPLICATE;
+        } else if (address_component_equals_root(value1, value2, normalize_options)) {
+            return root_comparison_status;
+        }
+    }
+    return LIBPOSTAL_NON_DUPLICATE;
+}
+
+/* Name: exact on a full-expansion match; a root-only match needs review. */
+libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME;
+    bool root_comparison_first = false;
+    libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW;
+    return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status);
+}
+
+/* Street: exact on a full-expansion match; a root-only match needs review. */
+libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET;
+    bool root_comparison_first = false;
+    libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW;
+    return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status);
+}
+
+/* House number: a root-expansion match already counts as exact. */
+libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_HOUSE_NUMBER;
+    bool root_comparison_first = true;
+    libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE;
+    return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status);
+}
+
+/* Unit: a root-expansion match already counts as exact. */
+libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_UNIT;
+    bool root_comparison_first = true;
+    libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE;
+    return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status);
+}
+
+/* Floor (level): a root-expansion match already counts as exact. */
+libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_LEVEL;
+    bool root_comparison_first = true;
+    libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE;
+    return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status);
+}
+
+/* PO box: a root-expansion match already counts as exact. */
+libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_PO_BOX;
+    bool root_comparison_first = true;
+    libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE;
+    return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status);
+}
+
+/* Postal code: a root-expansion match already counts as exact. */
+libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_POSTAL_CODE;
+    bool root_comparison_first = true;
+    libpostal_duplicate_status_t root_comparison_status = LIBPOSTAL_EXACT_DUPLICATE;
+    return is_duplicate(value1, value2, normalize_options, options, root_comparison_first, root_comparison_status);
+}
+
+/*
+ * Runs one city-level comparison for is_toponym_duplicate. On a match the
+ * given status is stored in *dupe_status and true is returned; otherwise
+ * (including when either string is NULL) *dupe_status is left untouched.
+ */
+static bool toponym_city_match(char *s1, char *s2, libpostal_normalize_options_t options, libpostal_duplicate_status_t status, libpostal_duplicate_status_t *dupe_status) {
+    if (s1 == NULL || s2 == NULL) return false;
+    if (!address_component_equals(s1, s2, options)) return false;
+
+    *dupe_status = status;
+    return true;
+}
+
+/*
+ * Compares two places at the city / state district / state / country level.
+ *
+ * The city-level match decides the base status:
+ *   city vs. city          -> LIBPOSTAL_EXACT_DUPLICATE
+ *   city district vs. city -> LIBPOSTAL_LIKELY_DUPLICATE
+ *   suburb vs. city        -> LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW
+ * The district/suburb fallbacks only apply to a side that has no city.
+ * Without a city-level match the result is LIBPOSTAL_NON_DUPLICATE, as it is
+ * when both sides carry a state district, state or country that differ.
+ */
+libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.num_languages = options.num_languages;
+    normalize_options.languages = options.languages;
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_TOPONYM;
+
+    bool city_match = false;
+    libpostal_duplicate_status_t dupe_status = LIBPOSTAL_NON_DUPLICATE;
+
+    place_t *place1 = place_from_components(num_components1, labels1, values1);
+    place_t *place2 = place_from_components(num_components2, labels2, values2);
+    if (place1 == NULL || place2 == NULL) {
+        goto exit_destroy_places;
+    }
+
+    if (place1->city != NULL && place2->city != NULL) {
+        city_match = toponym_city_match(place1->city, place2->city, normalize_options, LIBPOSTAL_EXACT_DUPLICATE, &dupe_status);
+    } else if (place2->city != NULL) {
+        // place1 has no city: try its city district, then its suburb
+        city_match = toponym_city_match(place1->city_district, place2->city, normalize_options, LIBPOSTAL_LIKELY_DUPLICATE, &dupe_status)
+                  || toponym_city_match(place1->suburb, place2->city, normalize_options, LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW, &dupe_status);
+    } else if (place1->city != NULL) {
+        // place2 has no city: try its city district, then its suburb
+        city_match = toponym_city_match(place1->city, place2->city_district, normalize_options, LIBPOSTAL_LIKELY_DUPLICATE, &dupe_status)
+                  || toponym_city_match(place1->city, place2->suburb, normalize_options, LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW, &dupe_status);
+    }
+
+    // dupe_status is still LIBPOSTAL_NON_DUPLICATE here unless a comparison matched
+    if (!city_match) {
+        goto exit_destroy_places;
+    }
+
+    if (place1->state_district != NULL && place2->state_district != NULL && !address_component_equals_root(place1->state_district, place2->state_district, normalize_options)) {
+        dupe_status = LIBPOSTAL_NON_DUPLICATE;
+        goto exit_destroy_places;
+    }
+
+    if (place1->state != NULL && place2->state != NULL && !address_component_equals(place1->state, place2->state, normalize_options)) {
+        dupe_status = LIBPOSTAL_NON_DUPLICATE;
+        goto exit_destroy_places;
+    }
+
+    if (place1->country != NULL && place2->country != NULL && !address_component_equals(place1->country, place2->country, normalize_options)) {
+        dupe_status = LIBPOSTAL_NON_DUPLICATE;
+        goto exit_destroy_places;
+    }
+
+exit_destroy_places:
+    if (place1 != NULL) place_destroy(place1);
+    if (place2 != NULL) place_destroy(place2);
+    return dupe_status;
+}
+
+/*
+ * Joins strings into one newly allocated string and fills tokens with one
+ * token per input string (byte offset into the joined string, byte length,
+ * and the type of the first token the scanner finds in that string).
+ * A single space follows every string except the last and any string whose
+ * token type is ideographic. tokens is cleared first.
+ *
+ * Returns NULL if tokens or strings is NULL, num_strings is 0, any string
+ * is NULL, or the buffer cannot be allocated. The caller frees the result.
+ */
+char *joined_string_and_tokens_from_strings(char **strings, size_t num_strings, token_array *tokens) {
+    if (tokens == NULL || strings == NULL || num_strings == 0) return NULL;
+    token_array_clear(tokens);
+
+    size_t full_len = 0;
+    for (size_t i = 0; i < num_strings; i++) {
+        if (strings[i] == NULL) return NULL;
+        full_len += strlen(strings[i]);
+        if (i < num_strings - 1) full_len++;
+    }
+
+    char_array *a = char_array_new_size(full_len);
+    if (a == NULL) return NULL;
+
+    for (size_t i = 0; i < num_strings; i++) {
+        char *str = strings[i];
+        size_t len = strlen(str);
+        size_t offset = a->n;
+        char_array_append(a, str);
+
+        scanner_t scanner = scanner_from_string(str, len);
+        uint16_t token_type = scan_token(&scanner);
+
+        token_t token = (token_t){offset, len, token_type};
+        token_array_push(tokens, token);
+        if (i < num_strings - 1 && !is_ideographic(token.type)) {
+            char_array_append(a, " ");
+        }
+    }
+
+    char_array_terminate(a);
+    return char_array_to_string(a);
+}
+
+/* True if any token in the array is both ideographic and a word token. */
+bool have_ideographic_word_tokens(token_array *token_array) {
+    if (token_array == NULL) return false;
+
+    size_t n = token_array->n;
+    token_t *tokens = token_array->a;
+    for (size_t i = 0; i < n; i++) {
+        token_t token = tokens[i];
+        if (is_ideographic(token.type) && is_word_token(token.type)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ * Fuzzy dupe check over pre-tokenized strings with client-supplied
+ * per-token scores.
+ *
+ * If both inputs contain ideographic word tokens, the similarity is the
+ * Jaccard similarity of the token arrays, and identical joined strings or
+ * intersecting root expansions decide the status directly (exact / likely).
+ * Otherwise the similarity is Soft-TFIDF: with languages given, dictionary
+ * phrases are looked up per language and the best score is kept; acronym
+ * alignments are included when do_acronyms is set.
+ *
+ * Unless already decided, the status is LIBPOSTAL_LIKELY_DUPLICATE when the
+ * similarity is >= options.likely_dupe_threshold,
+ * LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW when it is >=
+ * options.needs_review_threshold, and LIBPOSTAL_NON_DUPLICATE otherwise.
+ * Inputs that cannot be joined (e.g. zero tokens) are non-duplicates with
+ * similarity 0.0.
+ */
+libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) {
+    normalize_options.num_languages = options.num_languages;
+    normalize_options.languages = options.languages;
+
+    normalize_options.address_components |= LIBPOSTAL_ADDRESS_ANY;
+
+    double max_sim = 0.0;
+    bool ideographic = false;
+
+    // Default is non-duplicate
+    libpostal_duplicate_status_t dupe_status = LIBPOSTAL_NON_DUPLICATE;
+
+    size_t num_languages = options.num_languages;
+    char **languages = options.languages;
+
+    phrase_array *acronym_alignments = NULL;
+
+    phrase_array *phrases1 = NULL;
+    phrase_array *phrases2 = NULL;
+
+    token_array *token_array1 = token_array_new_size(num_tokens1);
+    char *joined1 = joined_string_and_tokens_from_strings(tokens1, num_tokens1, token_array1);
+
+    token_array *token_array2 = token_array_new_size(num_tokens2);
+    char *joined2 = joined_string_and_tokens_from_strings(tokens2, num_tokens2, token_array2);
+
+    // Nothing to compare if either side is empty or could not be built
+    if (joined1 == NULL || joined2 == NULL) {
+        goto exit_destroy_fuzzy;
+    }
+
+    ideographic = have_ideographic_word_tokens(token_array1) && have_ideographic_word_tokens(token_array2);
+
+    if (!ideographic) {
+        if (do_acronyms) {
+            acronym_alignments = acronym_token_alignments(joined1, token_array1, joined2, token_array2, num_languages, languages);
+        }
+
+        if (num_languages > 0) {
+            phrases1 = phrase_array_new();
+            phrases2 = phrase_array_new();
+
+            for (size_t i = 0; i < num_languages; i++) {
+                char *lang = languages[i];
+                phrase_array_clear(phrases1);
+                phrase_array_clear(phrases2);
+
+                search_address_dictionaries_tokens_with_phrases(joined1, token_array1, lang, &phrases1);
+                search_address_dictionaries_tokens_with_phrases(joined2, token_array2, lang, &phrases2);
+
+                double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options);
+                if (sim > max_sim) {
+                    max_sim = sim;
+                }
+            }
+        } else if (do_acronyms) {
+            max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options);
+        } else {
+            max_sim = soft_tfidf_similarity(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options);
+        }
+    } else {
+        max_sim = jaccard_similarity_string_arrays(num_tokens1, tokens1, num_tokens2, tokens2);
+        if (string_equals(joined1, joined2)) {
+            dupe_status = LIBPOSTAL_EXACT_DUPLICATE;
+        } else if (address_component_equals_root(joined1, joined2, normalize_options)) {
+            dupe_status = LIBPOSTAL_LIKELY_DUPLICATE;
+        }
+    }
+
+    if (dupe_status == LIBPOSTAL_NON_DUPLICATE) {
+        if (max_sim > options.likely_dupe_threshold || double_equals(max_sim, options.likely_dupe_threshold)) {
+            dupe_status = LIBPOSTAL_LIKELY_DUPLICATE;
+        } else if (max_sim > options.needs_review_threshold || double_equals(max_sim, options.needs_review_threshold)) {
+            dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW;
+        }
+    }
+
+exit_destroy_fuzzy:
+    if (phrases1 != NULL) {
+        phrase_array_destroy(phrases1);
+    }
+
+    if (phrases2 != NULL) {
+        phrase_array_destroy(phrases2);
+    }
+
+    if (acronym_alignments != NULL) {
+        phrase_array_destroy(acronym_alignments);
+    }
+
+    if (token_array1 != NULL) {
+        token_array_destroy(token_array1);
+    }
+
+    if (token_array2 != NULL) {
+        token_array_destroy(token_array2);
+    }
+
+    // free(NULL) is a no-op
+    free(joined1);
+    free(joined2);
+
+    return (libpostal_duplicate_status_similarity_t){dupe_status, max_sim};
+}
+
+/* Fuzzy name comparison: default Soft-TFIDF options, acronym alignments on. */
+libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME;
+
+    bool do_acronyms = true;
+
+    soft_tfidf_options_t soft_tfidf_options = soft_tfidf_default_options();
+
+    return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms);
+}
+
+/* Fuzzy street comparison: default Soft-TFIDF options, acronym alignments off. */
+libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
+    libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
+    normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET;
+
+    // General purpose acronyms didn't make as much sense in the street name context
+    // things like County Road = CR should be handled by the address dictionaries
+    bool do_acronyms = false;
+
+    soft_tfidf_options_t soft_tfidf_options = soft_tfidf_default_options();
+
+    return is_fuzzy_duplicate(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options, normalize_options, soft_tfidf_options, do_acronyms);
+}
diff --git a/src/dedupe.h b/src/dedupe.h
new file mode 100644
index 00000000..c9a4fdf8
--- /dev/null
+++ b/src/dedupe.h
@@ -0,0 +1,26 @@
+#ifndef DEDUPE_H
+#define DEDUPE_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libpostal.h"
+#include "string_utils.h"
+
+// Exact (expansion-based) duplicate checks for a single address component
+libpostal_duplicate_status_t is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+libpostal_duplicate_status_t is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+libpostal_duplicate_status_t is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+libpostal_duplicate_status_t is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+libpostal_duplicate_status_t is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+// Compares the city/state district/state/country of two labeled component lists
+libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options);
+
+// Fuzzy duplicate checks over tokens with client-supplied per-token scores
+libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
+libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
+
+
+#endif