From cccc3e9cf5bd35d6e5048a4e25125c7982d5be42 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 30 Oct 2015 02:02:16 -0400
Subject: [PATCH] [similarity] Using Soft-TFIDF for approximate name matching.

Soft-TFIDF is a hybrid string similarity metric which balances local
token similarities (Jaro-Winkler by default), allowing for slight
spelling errors, with global TF-IDF statistics, so that very frequent
words don't affect the score as much.
---
 scripts/geodata/names/__init__.py    |  0
 scripts/geodata/names/similarity.py  | 65 ++++++++++++++++++++++++++++++
 scripts/geodata/statistics/tf_idf.py |  4 +-
 3 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 scripts/geodata/names/__init__.py
 create mode 100644 scripts/geodata/names/similarity.py

diff --git a/scripts/geodata/names/__init__.py b/scripts/geodata/names/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/names/similarity.py b/scripts/geodata/names/similarity.py
new file mode 100644
index 00000000..35e0c352
--- /dev/null
+++ b/scripts/geodata/names/similarity.py
@@ -0,0 +1,65 @@
+import Levenshtein
+from collections import OrderedDict
+
+
+def soft_tfidf_similarity(tokens1, tokens2, idf,
+                          sim_func=Levenshtein.jaro_winkler, theta=0.9,
+                          common_word_threshold=100):
+    '''
+    Soft-TFIDF is a hybrid similarity function using both global statistics
+    (inverse document frequency) and local similarity (Jaro-Winkler).
+
+    For each token t1 in the first string, find the token t2 in the second
+    string which is most similar to t1 under the local similarity function.
+
+    The Soft-TFIDF similarity is the sum, over every token pair whose max
+    similarity is >= a given threshold theta, of that similarity times the
+    product of the two tokens' L2-normalized TF-IDF weights.
+
+    sim_func should return a number in the range [0, 1] and theta should
+    be in the same range, i.e. this would _not_ work for a metric like
+    basic Levenshtein or Damerau-Levenshtein distance, where we'd want the
+    value to be below the threshold. Those metrics can be transformed into
+    a [0, 1] similarity first.
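+    (For example, sim = 1.0 - distance / max(len(s1), len(s2)) turns
+    plain Levenshtein distance into a similarity bounded in [0, 1].)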
+
+    @param tokens1: normalized tokens of string 1 (list of strings only)
+    @param tokens2: normalized tokens of string 2 (list of strings only)
+
+    @param idf: IDFIndex from geodata.statistics.tf_idf
+    @param sim_func: similarity function which takes 2 strings and returns
+                     a number between 0 and 1
+    @param theta: token-level threshold on sim_func's return value at
+                  which point two tokens are considered "close"
+
+    Reference:
+    https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
+    '''
+
+    token1_counts = OrderedDict()
+    for k in tokens1:
+        token1_counts[k] = token1_counts.get(k, 0) + 1
+
+    token2_counts = OrderedDict()
+    for k in tokens2:
+        token2_counts[k] = token2_counts.get(k, 0) + 1
+
+    tfidf1 = idf.tfidf_vector(token1_counts)
+    tfidf2 = idf.tfidf_vector(token2_counts)
+
+    total_sim = 0.0
+
+    t1_len = len(token1_counts)
+    t2_len = len(token2_counts)
+    # Iterate over the shorter of the two token lists
+    if t2_len < t1_len:
+        token1_counts, token2_counts = token2_counts, token1_counts
+        tfidf1, tfidf2 = tfidf2, tfidf1
+    # For each token, find the most similar token in the other string
+    for i, t1 in enumerate(token1_counts):
+        sim, j = max((sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts))
+        if sim >= theta:
+            total_sim += sim * tfidf1[i] * tfidf2[j]
+
+    return total_sim
diff --git a/scripts/geodata/statistics/tf_idf.py b/scripts/geodata/statistics/tf_idf.py
index 37f80bf7..a6183d85 100644
--- a/scripts/geodata/statistics/tf_idf.py
+++ b/scripts/geodata/statistics/tf_idf.py
@@ -33,7 +33,7 @@ class IDFIndex(object):
             return 0.0
         return (math.log(count + 1.0) * (math.log(float(self.N) / idf_count)))
 
-    def tfidf_vector(self, tokens):
-        tf_idf = [self.tfidf_score(t) for t in tokens]
+    def tfidf_vector(self, token_counts):
+        tf_idf = [self.tfidf_score(t, count=c) for t, c in token_counts.iteritems()]
         norm = math.sqrt(sum((t ** 2 for t in tf_idf)))
         return [t / norm for t in tf_idf]
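
Not part of the patch: a minimal usage sketch of the new function,
assuming the patched module is importable as geodata.names.similarity.
ToyIDF, its doc_frequencies/n_docs arguments, and the sample counts are
all hypothetical; the class only stands in for IDFIndex's tfidf_vector()
interface (one L2-normalized TF-IDF score per distinct token, in
iteration order). Real callers would use an IDFIndex trained on a
name corpus.

    import math

    from geodata.names.similarity import soft_tfidf_similarity


    class ToyIDF(object):
        # Hypothetical stand-in for geodata.statistics.tf_idf.IDFIndex;
        # implements only the tfidf_vector() interface used above
        def __init__(self, doc_frequencies, n_docs):
            self.doc_frequencies = doc_frequencies  # token -> doc count
            self.n_docs = n_docs

        def tfidf_vector(self, token_counts):
            # Same shape as IDFIndex.tfidf_vector: log TF * log IDF,
            # L2-normalized, one score per distinct token in order
            scores = [math.log(c + 1.0) *
                      math.log(float(self.n_docs) / self.doc_frequencies.get(t, 1))
                      for t, c in token_counts.items()]
            norm = math.sqrt(sum(s * s for s in scores))
            return [s / norm for s in scores] if norm > 0.0 else scores


    idf = ToyIDF({'main': 40, 'st': 95, 'street': 90, 'mian': 1}, 100)

    # 'main'/'mian' score ~0.93 under Jaro-Winkler, so the transposition
    # still matches; 'st'/'street' score ~0.82, below theta=0.9, so the
    # very common suffix tokens drop out instead of skewing the score
    print(soft_tfidf_similarity(['main', 'st'], ['mian', 'street'], idf))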