[similarity] Using Soft-TFIDF for approximate name matching. Soft-TFIDF is a hybrid string distance metric that balances local token similarities (computed with Jaro-Winkler similarity by default, which allows for slight spelling errors) against global TFIDF statistics, so that very frequent words don't affect the score as much.
This commit is contained in:
@@ -33,7 +33,7 @@ class IDFIndex(object):
|
||||
return 0.0
|
||||
return (math.log(count + 1.0) * (math.log(float(self.N) / idf_count)))
|
||||
|
||||
def tfidf_vector(self, tokens):
|
||||
tf_idf = [self.tfidf_score(t) for t in tokens]
|
||||
def tfidf_vector(self, token_counts):
|
||||
tf_idf = [self.tfidf_score(t, count=c) for t, c in token_counts.iteritems()]
|
||||
norm = math.sqrt(sum((t ** 2 for t in tf_idf)))
|
||||
return [t / norm for t in tf_idf]
|
||||
|
||||
Reference in New Issue
Block a user