From c4aaee7dbfaf4d4c2eb565d768a9c54b6d52eea6 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Tue, 23 Jan 2018 01:20:14 -0500
Subject: [PATCH] [dedupe/similarity] also utilizing the L2 norm in similarity
 when acronyms are detected. Similarity in this case should be the acronym
 token's score * the L2 norm of the expanded tokens' scores in the longer
 string

---
 src/soft_tfidf.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c
index ca3a1646..a119180b 100644
--- a/src/soft_tfidf.c
+++ b/src/soft_tfidf.c
@@ -362,11 +362,14 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
             matched_tokens += p1.len;
             log_debug("have_phrase_match\n");
         } else {
+            double acronym_score = 0.0;
             for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
                 t2_score = token_scores2[p];
-                total_sim += t1_score * t2_score;
+                acronym_score += t2_score * t2_score;
             }
 
+            total_sim += t1_score * sqrt(acronym_score);
+
             log_debug("have_acronym_match\n");
             matched_tokens++;
         }