From 4aeb54905428f12ff21fa2defba7542e520d9744 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 26 Jan 2018 01:20:35 -0500
Subject: [PATCH] [dedupe] with some term weighting schemes (especially
 information gain, which will soon be the default in the lieu project),
 single letters may have very low weights and get discarded, which can lead
 to false positives for pairs like "A & B" vs. "B & C". Add a simple
 heuristic that demotes likely dupes to needs review when each string's set
 of single-letter tokens contains a letter the other lacks (i.e. A - B and
 B - A are both non-empty)

---
 src/dedupe.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/src/dedupe.c b/src/dedupe.c
index bf05fc79..50d33797 100644
--- a/src/dedupe.c
+++ b/src/dedupe.c
@@ -227,6 +227,70 @@ exit_destroy_places:
 }
 
+static khash_t(int_set) *single_letters_set(size_t num_tokens, char **tokens) {
+    khash_t(int_set) *letters = NULL;
+    for (size_t i = 0; i < num_tokens; i++) {
+        char *token = tokens[i];
+        size_t len = strlen(token);
+
+        uint8_t *ptr = (uint8_t *)token;
+        int32_t ch;
+        ssize_t char_len;
+        char_len = utf8proc_iterate(ptr, len, &ch);
+        if (char_len == len && utf8_is_letter(utf8proc_category(ch))) {
+            if (letters == NULL) {
+                letters = kh_init(int_set);
+            }
+            int ret = 0;
+            kh_put(int_set, letters, ch, &ret);
+            if (ret < 0) {
+                kh_destroy(int_set, letters);
+                return NULL;
+            }
+        }
+    }
+    return letters;
+}
+
+
+static bool have_symmetric_difference_in_single_letters(size_t num_tokens1, char **tokens1, size_t num_tokens2, char **tokens2) {
+    khash_t(int_set) *letters1 = single_letters_set(num_tokens1, tokens1);
+    khash_t(int_set) *letters2 = single_letters_set(num_tokens2, tokens2);
+
+    bool disjoint = false;
+    if (letters1 != NULL && letters2 != NULL) {
+        int32_t ch;
+        size_t num_missing1 = 0;
+        khiter_t k;
+        kh_foreach_key(letters1, ch, {
+            k = kh_get(int_set, letters2, ch);
+            if (k == kh_end(letters2)) {
+                num_missing1++;
+            }
+        });
+
+        size_t num_missing2 = 0;
+        kh_foreach_key(letters2, ch, {
+            k = kh_get(int_set, letters1, ch);
+            if (k == kh_end(letters1)) {
+                num_missing2++;
+            }
+        });
+
+        disjoint = num_missing1 > 0 && num_missing2 > 0;
+    }
+
+    if (letters1 != NULL) {
+        kh_destroy(int_set, letters1);
+    }
+
+    if (letters2 != NULL) {
+        kh_destroy(int_set, letters2);
+    }
+
+    return disjoint;
+}
+
 
 char *joined_string_and_tokens_from_strings(char **strings, size_t num_strings, token_array *tokens) {
     if (tokens == NULL || strings == NULL || num_strings == 0) return NULL;
     token_array_clear(tokens);
@@ -351,9 +415,17 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
     if (dupe_status == LIBPOSTAL_NON_DUPLICATE) {
         if (max_sim > options.likely_dupe_threshold || double_equals(max_sim, options.likely_dupe_threshold)) {
             dupe_status = LIBPOSTAL_LIKELY_DUPLICATE;
+
+            // Make sure we're not calling "A & B Jewelry" a duplicate of "B & C Jewelry"
+            // simply because single letters tend to be low-information. In this case, demote
+            // the pair to needs review as a precaution
+            if (have_symmetric_difference_in_single_letters(num_tokens1, tokens1, num_tokens2, tokens2)) {
+                dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW;
+            }
         } else if (max_sim > options.needs_review_threshold || double_equals(max_sim, options.needs_review_threshold)) {
             dupe_status = LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW;
         }
+
     }
 
     if (phrases1 != NULL) {
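
For illustration, a minimal standalone sketch of the same heuristic follows. It assumes ASCII tokens and uses a 26-bit mask in place of the patch's khash int_set of Unicode codepoints (the patch handles full Unicode via utf8proc_iterate and utf8_is_letter); the names single_letter_mask and mutual_letter_difference are hypothetical helpers for this sketch, not part of libpostal.

    #include <ctype.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Bitmask of single-letter tokens: bit i is set when the letter
       ('a' + i) occurs as a standalone one-character token. */
    static uint32_t single_letter_mask(size_t num_tokens, char **tokens) {
        uint32_t mask = 0;
        for (size_t i = 0; i < num_tokens; i++) {
            if (strlen(tokens[i]) == 1 && isalpha((unsigned char)tokens[i][0])) {
                mask |= 1u << (tolower((unsigned char)tokens[i][0]) - 'a');
            }
        }
        return mask;
    }

    /* True when each side has a single-letter token the other lacks,
       i.e. A - B and B - A are both non-empty. */
    static bool mutual_letter_difference(size_t n1, char **t1, size_t n2, char **t2) {
        uint32_t m1 = single_letter_mask(n1, t1);
        uint32_t m2 = single_letter_mask(n2, t2);
        return (m1 & ~m2) != 0 && (m2 & ~m1) != 0;
    }

    int main(void) {
        char *a[] = {"a", "&", "b", "jewelry"};
        char *b[] = {"b", "&", "c", "jewelry"};
        char *c[] = {"a", "b", "c", "jewelry"};

        /* "A & B Jewelry" vs. "B & C Jewelry": mutual difference, so the
           heuristic fires and the pair would be demoted to needs review. */
        printf("%d\n", mutual_letter_difference(4, a, 4, b));  /* prints 1 */

        /* "A & B Jewelry" vs. "A B C Jewelry": the difference is one-sided
           ({a, b} is a subset of {a, b, c}), so the heuristic does not fire. */
        printf("%d\n", mutual_letter_difference(4, a, 4, c));  /* prints 0 */
        return 0;
    }

Note that the one-sided case is deliberately left alone: when one name's single letters are a subset of the other's, the likely-dupe verdict stands, and only a mutual difference triggers the demotion, mirroring the num_missing1 > 0 && num_missing2 > 0 check in the patch.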