From 7cb85aa23c2f6be0b0d5e57a6872c0bf9739ebae Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Fri, 26 Jan 2018 18:04:45 -0500
Subject: [PATCH] [dedupe] to make soft token similarity order invariant, we
 swap the order so the shorter token sequence comes first. In the case of a
 tie, pick the shorter full string length

---
 src/soft_tfidf.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c
index bf1ae4c0..1032a9fc 100644
--- a/src/soft_tfidf.c
+++ b/src/soft_tfidf.c
@@ -115,11 +115,20 @@ bool phrases_have_same_canonical(size_t num_tokens1, char **tokens1, size_t num_
     return same_canonical;
 }
 
+/* Adds up strlen() of each of the num_tokens strings in tokens (0 if none). */
+static inline size_t sum_token_lengths(size_t num_tokens, char **tokens) {
+    size_t total = 0;
+    while (num_tokens-- > 0) {
+        total += strlen(*tokens++); /* by-value params double as cursors */
+    }
+    return total;
+}
+
 
 double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options, size_t *num_matches) {
     if (token_scores1 == NULL || token_scores2 == NULL) return 0.0;
 
-    if (num_tokens1 > num_tokens2) {
+    if (num_tokens1 > num_tokens2 || (num_tokens1 == num_tokens2 && sum_token_lengths(num_tokens1, tokens1) > sum_token_lengths(num_tokens2, tokens2))) {
         double *tmp_scores = token_scores1;
         token_scores1 = token_scores2;
         token_scores2 = tmp_scores;