From af5a5c30397ae939e1e9b45f24440eb5c9eda281 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Thu, 25 Jan 2018 16:32:47 -0500
Subject: [PATCH] [dedupe] in the case of abbreviations and acronyms, where we
 use the higher of the two scores, calculate an offset to the norm of the
 other string's scores i.e. since we're replacing the score(s) in the
 lower-scoring vector with the higher one in the dot product for the
 numerator, do the same for the L2-norm product in the denominator. This way
 we don't accidentally inflate the similarity value simply because e.g. an
 acronym token was more rare than the same acronym spelled out as multiple
 individual letters (tend to be low-information/common tokens).

---
 src/soft_tfidf.c | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c
index c732f0f3..bf1ae4c0 100644
--- a/src/soft_tfidf.c
+++ b/src/soft_tfidf.c
@@ -216,6 +216,9 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
 
     bool possible_affine_gap_abbreviations = options.possible_affine_gap_abbreviations;
 
+    double norm1_offset = 0.0;
+    double norm2_offset = 0.0;
+
     size_t matched_tokens = 0;
 
     for (size_t i = 0; i < len1; i++) {
@@ -345,10 +348,21 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
                 if (have_abbreviation && argmax_sim == last_abbreviation) {
                     double abbrev_sim = last_abbreviation_sim > max_sim ? last_abbreviation_sim : max_sim;
                     log_debug("have abbreviation, max(max_sim, last_abbreviation_sim) = %f\n", abbrev_sim);
-                    double max_score = t1_score > t2_score ? t1_score : t2_score;
+                    double max_score = 0.0;
+                    if (t1_score > t2_score || double_equals(t1_score, t2_score)) {
+                        norm2_offset += (t1_score * t1_score) - (t2_score * t2_score);
+                        log_debug("t1_score >= t2_score, norm2_offset = %f\n", norm2_offset);
+                        max_score = t1_score;
+                    } else {
+                        norm1_offset += (t2_score * t2_score) - (t1_score * t1_score);
+                        log_debug("t2_score > t1_score, norm1_offset = %f\n", norm1_offset);
+                        max_score = t2_score;
+                    }
+
                     jaro_winkler_sim = abbrev_sim * max_score * max_score;
                 } else {
-                     jaro_winkler_sim = max_sim * t1_score * t2_score;
+                    jaro_winkler_sim = max_sim * t1_score * t2_score;
+                    log_debug("t1_score = %f, t2_score = %f, jaro_winkler_sim = %f\n", t1_score, t2_score, jaro_winkler_sim);
                 }
                 total_sim += jaro_winkler_sim;
                 matched_tokens++;
@@ -391,9 +405,20 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
                 acronym_score += t2_score * t2_score;
             }
 
-            acronym_score = sqrt(acronym_score);
+            double norm_acronym_score = sqrt(acronym_score);
 
-            double max_acronym_score = t1_score > acronym_score ? t1_score : acronym_score;
+            double max_acronym_score = 0.0;
+            if (t1_score > norm_acronym_score || double_equals(t1_score, norm_acronym_score)) {
+                norm2_offset += (t1_score * t1_score) - acronym_score;
+                log_debug("t1_score >= norm_acronym_score, norm2_offset = %f\n", norm2_offset);
+                max_acronym_score = t1_score;
+            } else {
+                norm1_offset += acronym_score - (t1_score * t1_score);
+                log_debug("norm_acronym_score > t1_score, norm1_offset = %f\n", norm1_offset);
+                max_acronym_score = norm_acronym_score;
+            }
+
+            log_debug("max_acronym_score = %f\n", max_acronym_score);
 
             total_sim += max_acronym_score * max_acronym_score;
 
@@ -441,8 +466,8 @@ return_soft_tfidf_score:
         int64_array_destroy(acronym_memberships_array);
     }
 
-    double norm = double_array_l2_norm(token_scores1, num_tokens1) * double_array_l2_norm(token_scores2, num_tokens2);
-    log_debug("total_sim = %f, norm = %f\n", total_sim, norm);
+    double norm = sqrt(double_array_sum_sq(token_scores1, num_tokens1) + norm1_offset) * sqrt(double_array_sum_sq(token_scores2, num_tokens2) + norm2_offset);
+    log_debug("total_sim = %f, norm1_offset = %f, norm2_offset = %f, norm = %f\n", total_sim, norm1_offset, norm2_offset, norm);
 
     double total_sim_norm = total_sim / norm;
     return total_sim_norm > 1.0 ? 1.0 : total_sim_norm;