[similarity] Using Soft-TFIDF for approximate name matching. Soft-TFIDF is a hybrid string distance metric that balances local token similarities (Jaro-Winkler similarity by default), which allow for slight spelling errors, with global TFIDF statistics, so that very frequent words don't affect the score as much.

This commit is contained in:
Al
2015-10-30 02:02:16 -04:00
parent e7f783477f
commit cccc3e9cf5
3 changed files with 65 additions and 2 deletions

View File

@@ -33,7 +33,7 @@ class IDFIndex(object):
return 0.0
return (math.log(count + 1.0) * (math.log(float(self.N) / idf_count)))
def tfidf_vector(self, tokens):
tf_idf = [self.tfidf_score(t) for t in tokens]
def tfidf_vector(self, token_counts):
    """Return the L2-normalized TF-IDF scores for a token -> count mapping.

    Each (token, count) pair of ``token_counts`` is scored with
    ``self.tfidf_score(token, count=count)``; the scores are then divided
    by their Euclidean norm. The result follows the iteration order of
    ``token_counts``.

    If every score is zero, the all-zero vector is returned unchanged
    rather than dividing by a zero norm.
    """
    # ``items()`` exists on both Python 2 and Python 3 mappings, whereas
    # ``iteritems()`` is Python 2 only.
    scores = [self.tfidf_score(token, count=count)
              for token, count in token_counts.items()]
    norm = math.sqrt(sum(score ** 2 for score in scores))
    if norm == 0.0:
        # Nothing to scale: an empty input or all-zero scores would
        # otherwise raise ZeroDivisionError below.
        return scores
    return [score / norm for score in scores]