From a5c12960449562b16a6c2e2c3fdbdd694770e205 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 30 Oct 2015 13:35:11 -0400
Subject: [PATCH] [similarity] Add Jaccard similarity using word frequencies
 instead of simple sets; better for CJK scripts (Han, Hangul, etc.) in the
 absence of word segmentation, since there may be many high-frequency
 characters

---
 scripts/geodata/names/similarity.py | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/scripts/geodata/names/similarity.py b/scripts/geodata/names/similarity.py
index 35e0c352..ec5431d7 100644
--- a/scripts/geodata/names/similarity.py
+++ b/scripts/geodata/names/similarity.py
@@ -2,8 +2,15 @@
 import Levenshtein
 from collections import OrderedDict
 
+def ordered_word_count(tokens):
+    counts = OrderedDict()
+    for k in tokens:
+        counts[k] = counts.get(k, 0) + 1
+    return counts
+
+
 def soft_tfidf_similarity(tokens1, tokens2, idf,
-                          sim_func=Levenshtein.jaro_winkler, theta=0.9,
+                          sim_func=Levenshtein.jaro_winkler, theta=0.95,
                           common_word_threshold=100):
     '''
     Soft TFIDF is a hybrid distance function using both global statistics
@@ -35,13 +42,8 @@ def soft_tfidf_similarity(tokens1, tokens2, idf,
 
     https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
     '''
-    token1_counts = OrderedDict()
-    for k in tokens1:
-        token1_counts[k] = token1_counts.get(k, 0) + 1
-
-    token2_counts = OrderedDict()
-    for k in tokens2:
-        token2_counts[k] = token2_counts.get(k, 0) + 1
+    token1_counts = ordered_word_count(tokens1)
+    token2_counts = ordered_word_count(tokens2)
 
     tfidf1 = idf.tfidf_vector(token1_counts)
     tfidf2 = idf.tfidf_vector(token2_counts)
@@ -61,3 +63,26 @@ def soft_tfidf_similarity(tokens1, tokens2, idf,
         total_sim += sim * tfidf1[i] * tfidf2[j]
 
     return total_sim
+
+
+def jaccard_similarity(tokens1, tokens2):
+    '''
+    Traditionally, Jaccard similarity is defined on two sets:
+
+    Jaccard(A, B) = |A ∩ B| / |A ∪ B|
+
+    Applied directly to tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
+    would be 1.0, which is not ideal for entity name matching.
+
+    In this implementation the cardinalities of the intersection and union are
+    weighted by term frequencies, so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 2/3 ≈ 0.67
+    '''
+    token1_counts = ordered_word_count(tokens1)
+    token2_counts = ordered_word_count(tokens2)
+
+    intersection = sum(min(v, token2_counts[k]) for k, v in token1_counts.iteritems() if k in token2_counts)
+    union = sum(token1_counts.values()) + sum(token2_counts.values()) - intersection
+    # Both token lists empty: return 0.0 rather than dividing by zero
+    if union == 0:
+        return 0.0
+    return float(intersection) / union
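
Notes (illustrative, not part of the patch):

A quick sanity check of the weighted Jaccard above, runnable under Python 2
against the patched module. The import path and the example strings are
assumptions for illustration; only jaccard_similarity itself comes from the
patch:

    # -*- coding: utf-8 -*-
    from geodata.names.similarity import jaccard_similarity

    # Repeated tokens now lower the score: intersection = min(2, 1) + min(1, 1) = 2,
    # weighted union = 3 + 2 - 2 = 3, so the score is 2/3 rather than 1.0.
    print jaccard_similarity(['a', 'a', 'b'], ['a', 'b'])    # 0.666...

    # The CJK case from the commit message: without word segmentation, Han
    # text can be tokenized per character (these place names are made up).
    name1 = list(u'北京市朝阳区')    # six characters, all with frequency 1
    name2 = list(u'北京朝阳区')      # same name minus one character
    print jaccard_similarity(name1, name2)    # 5 common / 6 total = 0.833...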
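A note on the refactor: ordered_word_count behaves like collections.Counter
but preserves first-seen token order via OrderedDict, which
soft_tfidf_similarity appears to rely on when it indexes tfidf1[i] and
tfidf2[j] positionally. A small sketch of that property (Counter is shown
only for comparison; it is not used by the patch):

    from collections import Counter
    from geodata.names.similarity import ordered_word_count

    tokens = ['rua', 'de', 'sao', 'de']
    counts = ordered_word_count(tokens)
    assert dict(counts) == dict(Counter(tokens))    # same frequencies
    assert counts.keys() == ['rua', 'de', 'sao']    # insertion order preserved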