[similarity] Using Soft-TFIDF for approximate name matching. Soft-TFIDF is a hybrid string distance metric that balances local token similarities (using Jaro-Winkler similarity by default), which allow for slight spelling errors, with global TFIDF statistics, so that very frequent words don't affect the score as much.

2015-10-30 02:02:16 -04:00
parent e7f783477f
commit cccc3e9cf5
3 changed files with 65 additions and 2 deletions
--- a/scripts/geodata/statistics/tf_idf.py
+++ b/scripts/geodata/statistics/tf_idf.py
@@ -33,7 +33,7 @@ class IDFIndex(object):
            return 0.0
        return (math.log(count + 1.0) * (math.log(float(self.N) / idf_count)))

-    def tfidf_vector(self, tokens):
-        tf_idf = [self.tfidf_score(t) for t in tokens]
+    def tfidf_vector(self, token_counts):
+        tf_idf = [self.tfidf_score(t, count=c) for t, c in token_counts.iteritems()]
        norm = math.sqrt(sum((t ** 2 for t in tf_idf)))
        return [t / norm for t in tf_idf]