[similarity/dedupe] Add options for acronym alignments and address phrase matches in Soft-TFIDF. Acronym alignments give higher similarity to pairs such as "NYU" vs. "New York University", whereas phrase matches treat known phrases that share the same canonical form (e.g. "Cty Rd" vs. "C.R." vs. "County Road") as matching within the Soft-TFIDF similarity calculation.

2017-12-29 02:38:48 -05:00
parent 24a77ea03f
commit f1e6886536
2 changed files with 223 additions and 15 deletions
--- a/src/soft_tfidf.c
+++ b/src/soft_tfidf.c
@@ -1,7 +1,9 @@
 #include "soft_tfidf.h"
+#include "address_dictionary.h"
 #include "float_utils.h"
 #include "string_similarity.h"
 #include "string_utils.h"
+#include "log/log.h"

 static soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = {
    .jaro_winkler_min = 0.9,
@@ -15,11 +17,105 @@ soft_tfidf_options_t soft_tfidf_default_options(void) {
    return DEFAULT_SOFT_TFIDF_OPTIONS;
 }

+bool compare_canonical(address_expansion_t e1, char **tokens1, phrase_t match1, address_expansion_t e2, char **tokens2, phrase_t match2) { // Returns true when expansion e1 of phrase match1 (over tokens1) and expansion e2 of phrase match2 (over tokens2) resolve to the same canonical string.
+    bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; // no canonical index: the matched tokens themselves are used as the canonical text below
+    bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX;

-double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) {
+    if (!e1_canonical && !e2_canonical) { // both sides reference a dictionary canonical: equal indices decide
+        return e1.canonical_index == e2.canonical_index;
+    } else if (e1_canonical && e2_canonical) { // both sides are canonical text: require token-for-token equality
+        if (match1.len != match2.len || match1.len == 0) return false; // different lengths or an empty phrase can never match
+        for (size_t i = 0; i < match1.len; i++) {
+            char *s1 = tokens1[match1.start + i];
+            char *s2 = tokens2[match2.start + i];
+            if (!string_equals(s1, s2)) return false;
+        }
+        return true;
+    } else { // exactly one side is canonical text: walk its tokens along the other side's canonical string
+        char **canonical_tokens = e1_canonical ? tokens1 : tokens2;
+        char *other_canonical = e1_canonical ? address_dictionary_get_canonical(e2.canonical_index) : address_dictionary_get_canonical(e1.canonical_index);
+        phrase_t match = e1_canonical ? match1 : match2;
+
+        size_t canonical_index = 0; // byte offset reached so far inside other_canonical
+        size_t canonical_len = strlen(other_canonical);
+
+        for (size_t i = match.start; i < match.start + match.len; i++) {
+            char *canonical_token = canonical_tokens[i];
+            size_t canonical_token_len = strlen(canonical_token);
+
+            if (canonical_index + canonical_token_len <= canonical_len && strncmp(other_canonical + canonical_index, canonical_token, canonical_token_len) == 0) { // the token must appear at the current offset and fit inside the string
+                canonical_index += canonical_token_len;
+
+                if (i < match.start + match.len - 1 && canonical_index < canonical_len && strncmp(other_canonical + canonical_index, " ", 1) == 0) { // step over one separating space, but never after the last token
+                    canonical_index++;
+                }
+            } else {
+                return false;
+            }
+        }
+        return canonical_index == canonical_len; // the whole canonical string must be consumed; a bare prefix (e.g. tokens "county" against "county road") is not a match
+    }
+}
+
+typedef enum { // Result of comparing two phrases' expansions; the order matters because the similarity loop keeps the response that compares greater
+    CANONICAL_NO_MATCH = 0, // no pair of expansions was accepted
+    NEITHER_CANONICAL, // accepted pair where both expansions carry a canonical index
+    SECOND_CANONICAL, // accepted pair where only the second expansion has NULL_CANONICAL_INDEX
+    FIRST_CANONICAL, // accepted pair where only the first expansion has NULL_CANONICAL_INDEX
+    BOTH_CANONICAL // accepted pair where both expansions have NULL_CANONICAL_INDEX
+} canonical_match_t;
+
+bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2, phrase_t match1, phrase_t match2, canonical_match_t *response) { // Tries every pair of expansions of match1 and match2; returns true and classifies the pair in *response as soon as compare_canonical accepts one.
+    address_expansion_value_t *val1 = address_dictionary_get_expansions(match1.data); // the phrase's data field is the key passed to the dictionary lookup
+    address_expansion_value_t *val2 = address_dictionary_get_expansions(match2.data);
+
+    if (val1 == NULL || val2 == NULL) return false; // *response is left untouched on this early return
+
+    address_expansion_array *expansions_array1 = val1->expansions;
+    address_expansion_array *expansions_array2 = val2->expansions;
+
+    if (expansions_array1 == NULL || expansions_array2 == NULL) return false; // *response is left untouched here as well
+
+    address_expansion_t *expansions1 = expansions_array1->a;
+    address_expansion_t *expansions2 = expansions_array2->a;
+
+    *response = CANONICAL_NO_MATCH; // default reported when no pair is accepted
+
+    bool same_canonical = false;
+    for (size_t i = 0; i < expansions_array1->n; i++) {
+        address_expansion_t e1 = expansions1[i];
+
+        for (size_t j = 0; j < expansions_array2->n; j++) {
+            address_expansion_t e2 = expansions2[j];
+
+            same_canonical = compare_canonical(e1, tokens1, match1, e2, tokens2, match2);
+            if (same_canonical) {
+                bool e1_canonical = e1.canonical_index == NULL_CANONICAL_INDEX; // same test compare_canonical uses to pick its branch
+                bool e2_canonical = e2.canonical_index == NULL_CANONICAL_INDEX;
+
+                if (e1_canonical && e2_canonical) {
+                    *response = BOTH_CANONICAL;
+                } else if (e1_canonical) {
+                    *response = FIRST_CANONICAL;
+                } else if (e2_canonical) {
+                    *response = SECOND_CANONICAL;
+                } else {
+                    *response = NEITHER_CANONICAL;
+                }
+                break; // first accepted pair wins; this only leaves the inner loop
+            }
+        }
+        if (same_canonical) break; // carry the inner break out of the outer loop
+    }
+
+    return same_canonical; // NOTE(review): num_tokens1 and num_tokens2 are never read in this function
+}
+
+
+double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options) {
    if (token_scores1 == NULL || token_scores2 == NULL) return 0.0;

-    if (num_tokens2 < num_tokens1) {
+    if (num_tokens1 > num_tokens2) {
        double *tmp_scores = token_scores1;
        token_scores1 = token_scores2;
        token_scores2 = tmp_scores;
@@ -27,6 +123,10 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
        tokens1 = tokens2;
        tokens2 = tmp_tokens;

+        phrase_array *tmp_phrases = phrases1;
+        phrases1 = phrases2;
+        phrases2 = tmp_phrases;
+
        size_t tmp_num_tokens = num_tokens1;
        num_tokens1 = num_tokens2;
        num_tokens2 = tmp_num_tokens;
@@ -43,6 +143,14 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
    uint32_array *t1_unicode;
    uint32_array *t2_unicode;

+    int64_array *phrase_memberships_array1 = NULL;
+    int64_array *phrase_memberships_array2 = NULL;
+    int64_t *phrase_memberships1 = NULL;
+    int64_t *phrase_memberships2 = NULL;
+
+    int64_array *acronym_memberships_array = NULL;
+    int64_t *acronym_memberships = NULL;
+
    t1_tokens_unicode = calloc(len1, sizeof(uint32_array *));
    if (t1_tokens_unicode == NULL) {
        total_sim = -1.0;
@@ -63,13 +171,37 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
        goto return_soft_tfidf_score;
    }

-    for (size_t i = 0; i < len2; i++) {
-        t2_unicode = unicode_codepoints(tokens2[i]);
+    for (size_t j = 0; j < len2; j++) {
+        t2_unicode = unicode_codepoints(tokens2[j]);
        if (t2_unicode == NULL) {
            total_sim = -1.0;
            goto return_soft_tfidf_score;
        }
-        t2_tokens_unicode[i] = t2_unicode;
+        t2_tokens_unicode[j] = t2_unicode;
+    }
+
+
+    if (phrases1 != NULL && phrases2 != NULL) {
+        phrase_memberships_array1 = int64_array_new();
+        phrase_memberships_array2 = int64_array_new();
+        token_phrase_memberships(phrases1, phrase_memberships_array1, len1);
+        token_phrase_memberships(phrases2, phrase_memberships_array2, len2);
+
+        if (phrase_memberships_array1->n == len1) {
+            phrase_memberships1 = phrase_memberships_array1->a;
+        }
+
+        if (phrase_memberships_array2->n == len2) {
+            phrase_memberships2 = phrase_memberships_array2->a;
+        }
+    }
+
+    if (acronym_alignments != NULL) {
+        acronym_memberships_array = int64_array_new();
+        token_phrase_memberships(acronym_alignments, acronym_memberships_array, len2);
+        if (acronym_memberships_array->n == len2) {
+            acronym_memberships = acronym_memberships_array->a;
+        }
    }

    double jaro_winkler_min = options.jaro_winkler_min;
@@ -92,24 +224,65 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
        double argmin_dist_sim = 0.0;
        size_t last_abbreviation = 0;
        double last_abbreviation_sim = 0.0;
-        bool have_abbreviation = false;        
+        bool have_abbreviation = false;
+        bool have_acronym_match = false;
+        phrase_t acronym_phrase = NULL_PHRASE;
+        bool have_phrase_match = false;
+        int64_t pm1 = phrase_memberships1 != NULL ? phrase_memberships1[i] : NULL_PHRASE_MEMBERSHIP;
+        phrase_t p1 = pm1 >= 0 ? phrases1->a[pm1] : NULL_PHRASE;
+        phrase_t argmax_phrase = NULL_PHRASE;
+
+        canonical_match_t best_canonical_phrase_response = CANONICAL_NO_MATCH;
+
        double t2_score;

        for (size_t j = 0; j < len2; j++) {
            char *t2 = tokens2[j];
            t2u = t2_tokens_unicode[j];
+            int64_t pm2 = phrase_memberships2 != NULL ? phrase_memberships2[j] : NULL_PHRASE_MEMBERSHIP;
+            phrase_t p2 = pm2 >= 0 ? phrases2->a[pm2] : NULL_PHRASE;
+
+            canonical_match_t canonical_response = CANONICAL_NO_MATCH;
+            if (p1.len > 0 && p2.len > 0 && phrases_have_same_canonical(num_tokens1, tokens1, num_tokens2, tokens2, p1, p2, &canonical_response)) {
+                if (canonical_response > best_canonical_phrase_response) {
+                    best_canonical_phrase_response = canonical_response;
+                    argmax_sim = j;
+                    argmax_phrase = p2;
+                    max_sim = 1.0;
+                    have_phrase_match = true;
+                    continue;
+                }
+            }
+
            if (unicode_equals(t1u, t2u)) {
                max_sim = 1.0;
                argmax_sim = j;
                break;
            }

+            if (acronym_memberships != NULL) {
+                int64_t acronym_membership = acronym_memberships[j];
+                log_debug("acronym_membership = %zd\n", acronym_membership);
+                if (acronym_membership >= 0) {
+                   acronym_phrase = acronym_alignments->a[acronym_membership];
+                   uint32_t acronym_match_index = acronym_phrase.data;
+                   if (acronym_match_index == i) {
+                        max_sim = 1.0;
+                        argmax_sim = j;
+                        have_acronym_match = true;
+                        log_debug("have acronym match\n");
+                        break;
+                   }
+                }
+            }
+
            double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u);
            if (jaro_winkler > max_sim) {
                max_sim = jaro_winkler;
                argmax_sim = j;
            }

+
            if (use_damerau_levenshtein) {
                size_t replace_cost = 0;
                ssize_t dist = damerau_levenshtein_distance_unicode(t1u, t2u, replace_cost);
@@ -128,20 +301,36 @@ double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_s
                    have_abbreviation = true;
                }
            }
+
        }

        // Note: here edit distance, affine gap and abbreviations are only used in the thresholding process.
        // Jaro-Winkler is still used to calculate similarity

-        if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) {
-            t2_score = token_scores2[argmax_sim];
-            total_sim += max_sim * t1_score * t2_score;
-        } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
-            t2_score = token_scores2[argmin_dist];
-            total_sim += argmin_dist_sim * t1_score * t2_score;
-        } else if (use_abbreviations && have_abbreviation) {
-            t2_score = token_scores2[last_abbreviation];
-            total_sim += last_abbreviation_sim * t1_score * t2_score;
+        if (!have_acronym_match && !have_phrase_match) {
+            if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) {
+                log_debug("have max sim = %f\n", max_sim);
+                t2_score = token_scores2[argmax_sim];
+                total_sim += max_sim * t1_score * t2_score;
+            } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
+                log_debug("levenshtein\n");
+                t2_score = token_scores2[argmin_dist];
+                total_sim += argmin_dist_sim * t1_score * t2_score;
+            } else if (use_abbreviations && have_abbreviation) {
+                log_debug("have abbreviation\n");
+                t2_score = token_scores2[last_abbreviation];
+                total_sim += last_abbreviation_sim * t1_score * t2_score;
+            }
+        } else if (have_phrase_match) {
+            for (size_t p = argmax_phrase.start; p < argmax_phrase.start + argmax_phrase.len; p++) {
+                t2_score = token_scores2[p];
+                total_sim += max_sim * t1_score * t2_score;
+            }
+        } else {
+            for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
+                t2_score = token_scores2[p];
+                total_sim += max_sim * t1_score * t2_score;
+            }
        }
    }

@@ -166,5 +355,22 @@ return_soft_tfidf_score:
        free(t2_tokens_unicode);
    }

+    if (phrase_memberships_array1 != NULL) {
+        int64_array_destroy(phrase_memberships_array1);
+    }
+
+    if (phrase_memberships_array2 != NULL) {
+        int64_array_destroy(phrase_memberships_array2);
+    }
+
+    if (acronym_memberships_array != NULL) {
+        int64_array_destroy(acronym_memberships_array);
+    }
+
    return total_sim;
+}
+
+
+double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) { // Soft-TFIDF entry point without phrase or acronym inputs; forwards to the extended function.
+    return soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, options); // NULL for phrases1, phrases2 and acronym_alignments, so the NULL-guarded membership setup is skipped
 }
--- a/src/soft_tfidf.h
+++ b/src/soft_tfidf.h
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include "collections.h"
 #include "libpostal.h"
+#include "trie_search.h"

 /*
 This is a variant of Soft-TFIDF as described in:
@@ -41,6 +42,7 @@ typedef struct soft_tfidf_options {

 soft_tfidf_options_t soft_tfidf_default_options(void);

+double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options);
 double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options);

 #endif