[dedupe] for strict abbreviations (defined as sharing a prefix and a suffix, and containing matches+gaps only by the subtotaling affine gap measure), using the greater of the two scores. This accounts for cases where the abbreviated version may have a much higher weight in one string than the non-abbreviated version does in the other. Same for acronym alignments. Making sure there's a common prefix in regular abbreviation detection. Capping the Soft-TFIDF similarity at 1.0.

2018-01-25 14:19:44 -05:00
parent b4cc7395a2
commit d0fe31d359
3 changed files with 34 additions and 5 deletions
--- a/src/soft_tfidf.c
+++ b/src/soft_tfidf.c
@@ -11,7 +11,9 @@ static soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = {
    .jaro_winkler_min_length = 4,
    .damerau_levenshtein_max = 1,
    .damerau_levenshtein_min_length = 4,
-    .possible_affine_gap_abbreviations = true
+    .possible_affine_gap_abbreviations = true,
+    .strict_abbreviation_min_length = 4,
+    .strict_abbreviation_sim = 0.99
 };


@@ -210,6 +212,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
    size_t jaro_winkler_min_length = options.jaro_winkler_min_length;
    size_t damerau_levenshtein_max = options.damerau_levenshtein_max;
    size_t damerau_levenshtein_min_length = options.damerau_levenshtein_min_length;
+    size_t strict_abbreviation_min_length = options.strict_abbreviation_min_length;

    bool possible_affine_gap_abbreviations = options.possible_affine_gap_abbreviations;

@@ -232,6 +235,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
        size_t last_abbreviation = 0;
        double last_abbreviation_sim = 0.0;
        bool have_abbreviation = false;
+        bool have_strict_abbreviation = false;
        bool have_acronym_match = false;
        phrase_t acronym_phrase = NULL_PHRASE;
        bool have_phrase_match = false;
@@ -240,6 +244,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
        phrase_t argmax_phrase = NULL_PHRASE;
    
        bool use_jaro_winkler = t1_len >= jaro_winkler_min_length;
+        bool use_strict_abbreviation_sim = t1_len >= strict_abbreviation_min_length;
        bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1_len >= damerau_levenshtein_min_length;
        log_debug("use_jaro_winkler = %d, use_damerau_levenshtein=%d\n", use_jaro_winkler, use_damerau_levenshtein);

@@ -321,6 +326,9 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
                    last_abbreviation = j;
                    last_abbreviation_sim = jaro_winkler;
                    have_abbreviation = true;
+                    if (use_strict_abbreviation_sim && possible_abbreviation_unicode_strict(t1u, t2u)) {
+                        last_abbreviation_sim = last_abbreviation_sim > options.strict_abbreviation_sim ? last_abbreviation_sim : options.strict_abbreviation_sim;
+                    }
                }
            }

@@ -333,11 +341,23 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
            if (have_equal || (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)))) {
                log_debug("jaro-winkler, max_sim = %f\n", max_sim);
                t2_score = token_scores2[argmax_sim];
-                total_sim += max_sim * t1_score * t2_score;
+                double jaro_winkler_sim = 0.0;
+                if (have_abbreviation && argmax_sim == last_abbreviation) {
+                    double abbrev_sim = last_abbreviation_sim > max_sim ? last_abbreviation_sim : max_sim;
+                    log_debug("have abbreviation, max(max_sim, last_abbreviation_sim) = %f\n", abbrev_sim);
+                    double max_score = t1_score > t2_score ? t1_score : t2_score;
+                    jaro_winkler_sim = abbrev_sim * max_score * max_score;
+                } else {
+                     jaro_winkler_sim = max_sim * t1_score * t2_score;
+                }
+                total_sim += jaro_winkler_sim;
                matched_tokens++;
            } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
                log_debug("levenshtein, argmin_dist_sim = %f\n", argmin_dist_sim);
                t2_score = token_scores2[argmin_dist];
+                if (have_abbreviation && argmin_dist == last_abbreviation) {
+                    argmin_dist_sim = last_abbreviation_sim > argmin_dist_sim ? last_abbreviation_sim : argmin_dist_sim;
+                }
                total_sim += argmin_dist_sim * t1_score * t2_score;
                matched_tokens++;
            } else if (possible_affine_gap_abbreviations && have_abbreviation) {
@@ -371,7 +391,11 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
                acronym_score += t2_score * t2_score;
            }

-            total_sim += t1_score * sqrt(acronym_score);
+            acronym_score = sqrt(acronym_score);
+
+            double max_acronym_score = t1_score > acronym_score ? t1_score : acronym_score;
+
+            total_sim += max_acronym_score * max_acronym_score;

            log_debug("have_acronym_match\n");
            matched_tokens++;
@@ -420,7 +444,8 @@ return_soft_tfidf_score:
    double norm = double_array_l2_norm(token_scores1, num_tokens1) * double_array_l2_norm(token_scores2, num_tokens2);
    log_debug("total_sim = %f, norm = %f\n", total_sim, norm);

-    return total_sim / norm;
+    double total_sim_norm = total_sim / norm;
+    return total_sim_norm > 1.0 ? 1.0 : total_sim_norm;
 }


--- a/src/soft_tfidf.h
+++ b/src/soft_tfidf.h
@@ -39,6 +39,8 @@ typedef struct soft_tfidf_options {
    size_t damerau_levenshtein_max;
    size_t damerau_levenshtein_min_length;
    bool possible_affine_gap_abbreviations;
+    size_t strict_abbreviation_min_length;
+    double strict_abbreviation_sim;
 } soft_tfidf_options_t;

 soft_tfidf_options_t soft_tfidf_default_options(void);
--- a/src/string_similarity.c
+++ b/src/string_similarity.c
@@ -299,7 +299,9 @@ bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_arr
 inline bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array) {
    affine_gap_edits_t edits = affine_gap_distance_unicode(u1_array, u2_array);

-    return possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits);
+    ssize_t prefix_len = unicode_common_prefix(u1_array, u2_array);
+
+    return prefix_len > 0 && possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits);
 }