From cccc3e9cf5bd35d6e5048a4e25125c7982d5be42 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 30 Oct 2015 02:02:16 -0400
Subject: [PATCH] [similarity] Using Soft-TFIDF for approximate name matching.

Soft-TFIDF is a hybrid string similarity metric which balances local
token similarities (Jaro-Winkler by default), allowing for slight
spelling errors, with global TF-IDF statistics, so that very frequent
words don't affect the score as much.
---
 scripts/geodata/names/__init__.py    |  0
 scripts/geodata/names/similarity.py  | 65 ++++++++++++++++++++++++++++++
 scripts/geodata/statistics/tf_idf.py |  4 +-
 3 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 scripts/geodata/names/__init__.py
 create mode 100644 scripts/geodata/names/similarity.py

diff --git a/scripts/geodata/names/__init__.py b/scripts/geodata/names/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/names/similarity.py b/scripts/geodata/names/similarity.py
new file mode 100644
index 00000000..35e0c352
--- /dev/null
+++ b/scripts/geodata/names/similarity.py
@@ -0,0 +1,65 @@
+import Levenshtein
+from collections import OrderedDict
+
+
+def soft_tfidf_similarity(tokens1, tokens2, idf,
+                          sim_func=Levenshtein.jaro_winkler, theta=0.9,
+                          common_word_threshold=100):
+    '''
+    Soft-TFIDF is a hybrid similarity function using both global statistics
+    (inverse document frequency) and local similarity (Jaro-Winkler).
+
+    For each token t1 in the first string, find the token t2 in the second
+    string which is most similar to t1 under the local similarity function.
+
+    The Soft-TFIDF similarity is the sum, over every token pair whose max
+    similarity is >= a given threshold theta, of that similarity times the
+    product of the two tokens' L2-normalized TF-IDF weights.
+
+    sim_func should return a number in the range [0, 1] and theta should
+    be in the same range, i.e. this would _not_ work for a metric like
+    basic Levenshtein or Damerau-Levenshtein distance, where we'd want the
+    value to be below the threshold. Those metrics can be transformed into
+    a [0, 1] similarity first.
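+    (For example, sim = 1.0 - distance / max(len(s1), len(s2)) turns
+    plain Levenshtein distance into a similarity bounded in [0, 1].)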
+
+    @param tokens1: normalized tokens of string 1 (list of strings only)
+    @param tokens2: normalized tokens of string 2 (list of strings only)
+
+    @param idf: IDFIndex from geodata.statistics.tf_idf
+    @param sim_func: similarity function which takes 2 strings and returns
+                     a number between 0 and 1
+    @param theta: token-level threshold on sim_func's return value at
+                  which point two tokens are considered "close"
+
+    Reference:
+    https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
+    '''
+
+    token1_counts = OrderedDict()
+    for k in tokens1:
+        token1_counts[k] = token1_counts.get(k, 0) + 1
+
+    token2_counts = OrderedDict()
+    for k in tokens2:
+        token2_counts[k] = token2_counts.get(k, 0) + 1
+
+    tfidf1 = idf.tfidf_vector(token1_counts)
+    tfidf2 = idf.tfidf_vector(token2_counts)
+
+    total_sim = 0.0
+
+    t1_len = len(token1_counts)
+    t2_len = len(token2_counts)
+    # Iterate over the shorter of the two token lists
+    if t2_len < t1_len:
+        token1_counts, token2_counts = token2_counts, token1_counts
+        tfidf1, tfidf2 = tfidf2, tfidf1
+    # For each token, find the most similar token in the other string
+    for i, t1 in enumerate(token1_counts):
+        sim, j = max((sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts))
+        if sim >= theta:
+            total_sim += sim * tfidf1[i] * tfidf2[j]
+
+    return total_sim
diff --git a/scripts/geodata/statistics/tf_idf.py b/scripts/geodata/statistics/tf_idf.py
index 37f80bf7..a6183d85 100644
--- a/scripts/geodata/statistics/tf_idf.py
+++ b/scripts/geodata/statistics/tf_idf.py
@@ -33,7 +33,7 @@ class IDFIndex(object):
             return 0.0
         return (math.log(count + 1.0) * (math.log(float(self.N) / idf_count)))
 
-    def tfidf_vector(self, tokens):
-        tf_idf = [self.tfidf_score(t) for t in tokens]
+    def tfidf_vector(self, token_counts):
+        tf_idf = [self.tfidf_score(t, count=c) for t, c in token_counts.iteritems()]
         norm = math.sqrt(sum((t ** 2 for t in tf_idf)))
         return [t / norm for t in tf_idf]
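
Not part of the patch: a minimal usage sketch of the new function,
assuming the patched module is importable as geodata.names.similarity.
ToyIDF, its doc_frequencies/n_docs arguments, and the sample counts are
all hypothetical; the class only stands in for IDFIndex's tfidf_vector()
interface (one L2-normalized TF-IDF score per distinct token, in
iteration order). Real callers would use an IDFIndex trained on a
name corpus.

    import math

    from geodata.names.similarity import soft_tfidf_similarity


    class ToyIDF(object):
        # Hypothetical stand-in for geodata.statistics.tf_idf.IDFIndex;
        # implements only the tfidf_vector() interface used above
        def __init__(self, doc_frequencies, n_docs):
            self.doc_frequencies = doc_frequencies  # token -> doc count
            self.n_docs = n_docs

        def tfidf_vector(self, token_counts):
            # Same shape as IDFIndex.tfidf_vector: log TF * log IDF,
            # L2-normalized, one score per distinct token in order
            scores = [math.log(c + 1.0) *
                      math.log(float(self.n_docs) / self.doc_frequencies.get(t, 1))
                      for t, c in token_counts.items()]
            norm = math.sqrt(sum(s * s for s in scores))
            return [s / norm for s in scores] if norm > 0.0 else scores


    idf = ToyIDF({'main': 40, 'st': 95, 'street': 90, 'mian': 1}, 100)

    # 'main'/'mian' score ~0.93 under Jaro-Winkler, so the transposition
    # still matches; 'st'/'street' score ~0.82, below theta=0.9, so the
    # very common suffix tokens drop out instead of skewing the score
    print(soft_tfidf_similarity(['main', 'st'], ['mian', 'street'], idf))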