From f1e68865366ed1287f3f1d573d0f05e88416d248 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 29 Dec 2017 02:38:48 -0500
Subject: [PATCH] [similarity/dedupe] adding options for acronym alignments and address phrase matches in Soft-TFIDF. Acronym alignments will give higher similarity to NYU vs. "New York University" whereas phrase matches would match known phrases that share the same canonical like "Cty Rd" vs. "C.R." vs. "County Road" within the Soft-TFIDF similarity calculation.

---
Reviewer notes (text between the three-dash line and the first diff header
is commentary and is not part of the change):

* compare_canonical(): when exactly one side is a canonical phrase, the
  function now returns true only if the phrase tokens consume the entire
  dictionary canonical string. It used to return true as soon as the tokens
  matched a prefix, so a one-token phrase "county" compared equal to the
  canonical "county road".
* soft_tfidf_similarity_with_phrases_and_acronyms(): the acronym_membership
  debug message formatted an int64_t with %zd; it now casts to long long
  and uses %lld.
* Both fixes replace a line in place, so hunk line counts and the diffstat
  are unchanged. The post-image blob id on the src/soft_tfidf.c index line
  was not recomputed.
* This copy of the patch had lost its line breaks and indentation. Line
  breaks, 4-space indentation and blank context lines were restored to fit
  each hunk's line counts; in the "-92,24 +224,65" hunk the position of the
  blank context lines is a best guess, the whitespace-only change to
  have_abbreviation can no longer be told apart, and the first context line
  of the first src/soft_tfidf.h hunk is still missing its header name.
  Check these against the target tree before applying.
* NOTE(review): strlen(other_canonical) runs on the unchecked result of
  address_dictionary_get_canonical(); confirm it cannot return NULL here.
* NOTE(review): the int64_array_new() results are dereferenced without a
  NULL check; confirm the allocator's failure behaviour.
* NOTE(review): when the two token sequences are swapped, acronym_alignments
  is used as passed in; confirm which sequence its start/data indices refer
  to in that case.

 src/soft_tfidf.c | 236 ++++++++++++++++++++++++++++++++++++++++++++---
 src/soft_tfidf.h |   2 +
 2 files changed, 223 insertions(+), 15 deletions(-)

diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c
index 1bd43220..f5f1800f 100644
--- a/src/soft_tfidf.c
+++ b/src/soft_tfidf.c
@@ -1,7 +1,9 @@
 #include "soft_tfidf.h"
+#include "address_dictionary.h"
 #include "float_utils.h"
 #include "string_similarity.h"
 #include "string_utils.h"
+#include "log/log.h"
 
 static soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = {
     .jaro_winkler_min = 0.9,
@@ -15,11 +17,105 @@ soft_tfidf_options_t soft_tfidf_default_options(void) {
     return DEFAULT_SOFT_TFIDF_OPTIONS;
 }
 
+bool compare_canonical(address_expansion_t e1, char **tokens1, phrase_t match1, address_expansion_t e2, char **tokens2, phrase_t match2) {
+    bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX;
+    bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX;
 
-double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) {
+    if (!e1_canonical && !e2_canonical) {
+        return e1.canonical_index == e2.canonical_index;
+    } else if (e1_canonical && e2_canonical) {
+        if (match1.len != match2.len || match1.len == 0) return false;
+        for (size_t i = 0; i < match1.len; i++) {
+            char *s1 = tokens1[match1.start + i];
+            char *s2 = tokens2[match2.start + i];
+            if (!string_equals(s1, s2)) return false;
+        }
+        return true;
+    } else {
+        char **canonical_tokens = e1_canonical ? tokens1 : tokens2;
+        char *other_canonical = e1_canonical ? address_dictionary_get_canonical(e2.canonical_index) : address_dictionary_get_canonical(e1.canonical_index);
+        phrase_t match = e1_canonical ? match1 : match2;
+
+        size_t canonical_index = 0;
+        size_t canonical_len = strlen(other_canonical);
+
+        for (size_t i = match.start; i < match.start + match.len; i++) {
+            char *canonical_token = canonical_tokens[i];
+            size_t canonical_token_len = strlen(canonical_token);
+
+            if (canonical_index + canonical_token_len <= canonical_len && strncmp(other_canonical + canonical_index, canonical_token, canonical_token_len) == 0) {
+                canonical_index += canonical_token_len;
+
+                if (i < match.start + match.len - 1 && canonical_index < canonical_len && strncmp(other_canonical + canonical_index, " ", 1) == 0) {
+                    canonical_index++;
+                }
+            } else {
+                return false;
+            }
+        }
+        return canonical_index == canonical_len;
+    }
+}
+
+typedef enum {
+    CANONICAL_NO_MATCH = 0,
+    NEITHER_CANONICAL,
+    SECOND_CANONICAL,
+    FIRST_CANONICAL,
+    BOTH_CANONICAL
+} canonical_match_t;
+
+bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) {
+    address_expansion_value_t *val1 = address_dictionary_get_expansions(match1.data);
+    address_expansion_value_t *val2 = address_dictionary_get_expansions(match2.data);
+
+    if (val1 == NULL || val2 == NULL) return false;
+
+    address_expansion_array *expansions_array1 = val1->expansions;
+    address_expansion_array *expansions_array2 = val2->expansions;
+
+    if (expansions_array1 == NULL || expansions_array2 == NULL) return false;
+
+    address_expansion_t *expansions1 = expansions_array1->a;
+    address_expansion_t *expansions2 = expansions_array2->a;
+
+    *response = CANONICAL_NO_MATCH;
+
+    bool same_canonical = false;
+    for (size_t i = 0; i < expansions_array1->n; i++) {
+        address_expansion_t e1 = expansions1[i];
+
+        for (size_t j = 0; j < expansions_array2->n; j++) {
+            address_expansion_t e2 = expansions2[j];
+
+            same_canonical = compare_canonical(e1, tokens1, match1, e2, tokens2, match2);
+            if (same_canonical) {
+                bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX;
+                bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX;
+
+                if (e1_canonical && e2_canonical) {
+                    *response = BOTH_CANONICAL;
+                } else if (e1_canonical) {
+                    *response = FIRST_CANONICAL;
+                } else if (e2_canonical) {
+                    *response = SECOND_CANONICAL;
+                } else {
+                    *response = NEITHER_CANONICAL;
+                }
+                break;
+            }
+        }
+        if (same_canonical) break;
+    }
+
+    return same_canonical;
+}
+
+
+double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options) {
     if (token_scores1 == NULL || token_scores2 == NULL) return 0.0;
 
-    if (num_tokens2 < num_tokens1) {
+    if (num_tokens1 > num_tokens2) {
         double *tmp_scores = token_scores1;
         token_scores1 = token_scores2;
         token_scores2 = tmp_scores;
@@ -27,6 +123,10 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
         tokens1 = tokens2;
         tokens2 = tmp_tokens;
 
+        phrase_array *tmp_phrases = phrases1;
+        phrases1 = phrases2;
+        phrases2 = tmp_phrases;
+
         size_t tmp_num_tokens = num_tokens1;
         num_tokens1 = num_tokens2;
         num_tokens2 = tmp_num_tokens;
@@ -43,6 +143,14 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
     uint32_array *t1_unicode;
     uint32_array *t2_unicode;
 
+    int64_array *phrase_memberships_array1 = NULL;
+    int64_array *phrase_memberships_array2 = NULL;
+    int64_t *phrase_memberships1 = NULL;
+    int64_t *phrase_memberships2 = NULL;
+
+    int64_array *acronym_memberships_array = NULL;
+    int64_t *acronym_memberships = NULL;
+
     t1_tokens_unicode = calloc(len1, sizeof(uint32_array *));
     if (t1_tokens_unicode == NULL) {
         total_sim = -1.0;
@@ -63,13 +171,37 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
         goto return_soft_tfidf_score;
     }
 
-    for (size_t i = 0; i < len2; i++) {
-        t2_unicode = unicode_codepoints(tokens2[i]);
+    for (size_t j = 0; j < len2; j++) {
+        t2_unicode = unicode_codepoints(tokens2[j]);
         if (t2_unicode == NULL) {
             total_sim = -1.0;
             goto return_soft_tfidf_score;
         }
-        t2_tokens_unicode[i] = t2_unicode;
+        t2_tokens_unicode[j] = t2_unicode;
+    }
+
+
+    if (phrases1 != NULL && phrases2 != NULL) {
+        phrase_memberships_array1 = int64_array_new();
+        phrase_memberships_array2 = int64_array_new();
+        token_phrase_memberships(phrases1, phrase_memberships_array1, len1);
+        token_phrase_memberships(phrases2, phrase_memberships_array2, len2);
+
+        if (phrase_memberships_array1->n == len1) {
+            phrase_memberships1 = phrase_memberships_array1->a;
+        }
+
+        if (phrase_memberships_array2->n == len2) {
+            phrase_memberships2 = phrase_memberships_array2->a;
+        }
+    }
+
+    if (acronym_alignments != NULL) {
+        acronym_memberships_array = int64_array_new();
+        token_phrase_memberships(acronym_alignments, acronym_memberships_array, len2);
+        if (acronym_memberships_array->n == len2) {
+            acronym_memberships = acronym_memberships_array->a;
+        }
     }
 
     double jaro_winkler_min = options.jaro_winkler_min;
@@ -92,24 +224,65 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
         double argmin_dist_sim = 0.0;
         size_t last_abbreviation = 0;
         double last_abbreviation_sim = 0.0;
-        bool have_abbreviation = false;
+        bool have_abbreviation = false;
+        bool have_acronym_match = false;
+        phrase_t acronym_phrase = NULL_PHRASE;
+        bool have_phrase_match = false;
+        int64_t pm1 = phrase_memberships1 != NULL ? phrase_memberships1[i] : NULL_PHRASE_MEMBERSHIP;
+        phrase_t p1 = pm1 >= 0 ? phrases1->a[pm1] : NULL_PHRASE;
+        phrase_t argmax_phrase = NULL_PHRASE;
+
+        canonical_match_t best_canonical_phrase_response = CANONICAL_NO_MATCH;
+
         double t2_score;
 
         for (size_t j = 0; j < len2; j++) {
             char *t2 = tokens2[j];
             t2u = t2_tokens_unicode[j];
 
+            int64_t pm2 = phrase_memberships2 != NULL ? phrase_memberships2[j] : NULL_PHRASE_MEMBERSHIP;
+            phrase_t p2 = pm2 >= 0 ? phrases2->a[pm2] : NULL_PHRASE;
+
+            canonical_match_t canonical_response = CANONICAL_NO_MATCH;
+            if (p1.len > 0 && p2.len > 0 && phrases_have_same_canonical(num_tokens1, tokens1, num_tokens2, tokens2, p1, p2, &canonical_response)) {
+                if (canonical_response > best_canonical_phrase_response) {
+                    best_canonical_phrase_response = canonical_response;
+                    argmax_sim = j;
+                    argmax_phrase = p2;
+                    max_sim = 1.0;
+                    have_phrase_match = true;
+                    continue;
+                }
+            }
+
             if (unicode_equals(t1u, t2u)) {
                 max_sim = 1.0;
                 argmax_sim = j;
                 break;
             }
 
+            if (acronym_memberships != NULL) {
+                int64_t acronym_membership = acronym_memberships[j];
+                log_debug("acronym_membership = %lld\n", (long long)acronym_membership);
+                if (acronym_membership >= 0) {
+                    acronym_phrase = acronym_alignments->a[acronym_membership];
+                    uint32_t acronym_match_index = acronym_phrase.data;
+                    if (acronym_match_index == i) {
+                        max_sim = 1.0;
+                        argmax_sim = j;
+                        have_acronym_match = true;
+                        log_debug("have acronym match\n");
+                        break;
+                    }
+                }
+            }
+
             double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u);
             if (jaro_winkler > max_sim) {
                 max_sim = jaro_winkler;
                 argmax_sim = j;
             }
+
             if (use_damerau_levenshtein) {
                 size_t replace_cost = 0;
                 ssize_t dist = damerau_levenshtein_distance_unicode(t1u, t2u, replace_cost);
@@ -128,20 +301,36 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
                     have_abbreviation = true;
                 }
             }
+
         }
 
         // Note: here edit distance, affine gap and abbreviations are only used in the thresholding process.
         // Jaro-Winkler is still used to calculate similarity
 
-        if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) {
-            t2_score = token_scores2[argmax_sim];
-            total_sim += max_sim * t1_score * t2_score;
-        } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
-            t2_score = token_scores2[argmin_dist];
-            total_sim += argmin_dist_sim * t1_score * t2_score;
-        } else if (use_abbreviations && have_abbreviation) {
-            t2_score = token_scores2[last_abbreviation];
-            total_sim += last_abbreviation_sim * t1_score * t2_score;
+        if (!have_acronym_match && !have_phrase_match) {
+            if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) {
+                log_debug("have max sim = %f\n", max_sim);
+                t2_score = token_scores2[argmax_sim];
+                total_sim += max_sim * t1_score * t2_score;
+            } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
+                log_debug("levenshtein\n");
+                t2_score = token_scores2[argmin_dist];
+                total_sim += argmin_dist_sim * t1_score * t2_score;
+            } else if (use_abbreviations && have_abbreviation) {
+                log_debug("have abbreviation\n");
+                t2_score = token_scores2[last_abbreviation];
+                total_sim += last_abbreviation_sim * t1_score * t2_score;
+            }
+        } else if (have_phrase_match) {
+            for (size_t p = argmax_phrase.start; p < argmax_phrase.start + argmax_phrase.len; p++) {
+                t2_score = token_scores2[p];
+                total_sim += max_sim * t1_score * t2_score;
+            }
+        } else {
+            for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
+                t2_score = token_scores2[p];
+                total_sim += max_sim * t1_score * t2_score;
+            }
         }
     }
 
@@ -166,5 +355,22 @@ return_soft_tfidf_score:
         free(t2_tokens_unicode);
     }
 
+    if (phrase_memberships_array1 != NULL) {
+        int64_array_destroy(phrase_memberships_array1);
+    }
+
+    if (phrase_memberships_array2 != NULL) {
+        int64_array_destroy(phrase_memberships_array2);
+    }
+
+    if (acronym_memberships_array != NULL) {
+        int64_array_destroy(acronym_memberships_array);
+    }
+
     return total_sim;
+}
+
+
+double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) {
+    return soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, options);
 }
\ No newline at end of file
diff --git a/src/soft_tfidf.h b/src/soft_tfidf.h
index 7d777fc5..244578ba 100644
--- a/src/soft_tfidf.h
+++ b/src/soft_tfidf.h
@@ -4,6 +4,7 @@
 #include
 #include "collections.h"
 #include "libpostal.h"
+#include "trie_search.h"
 
 /*
 This is a variant of Soft-TFIDF as described in:
@@ -41,6 +42,7 @@ typedef struct soft_tfidf_options {
 
 soft_tfidf_options_t soft_tfidf_default_options(void);
 
+double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options);
 double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options);
 
 #endif
\ No newline at end of file