[similarity] Adding an in-memory IDF index for weighted similarities

This commit is contained in:
Al
2015-10-29 12:53:11 -04:00
parent 1c543a5271
commit 5076c0409b
3 changed files with 39 additions and 3 deletions

View File

View File

@@ -0,0 +1,39 @@
import math
from collections import defaultdict
class IDFIndex(object):
    """In-memory document-frequency index used to compute TF-IDF weights.

    ``idf_counts`` maps each feature to the number of documents passed to
    ``update`` that contained it; ``N`` is the number of non-empty documents
    counted so far.
    """

    # ``update`` becomes a no-op while this is truthy. Nothing in this class
    # sets it; callers may flip it on an instance to freeze the counts.
    finalized = False

    def __init__(self):
        self.idf_counts = defaultdict(int)
        self.N = 0

    def update(self, doc):
        """Count one document towards the index.

        :param doc: mapping of feature -> count for a single document. Only
            the keys are used: each distinct feature adds one to its
            document frequency regardless of its in-document count.

        Empty/``None`` documents are ignored, as is every call made while
        ``finalized`` is set.
        """
        if self.finalized or not doc:
            return

        for feature in doc:
            self.idf_counts[feature] += 1

        self.N += 1

    def prune(self, min_count):
        """Drop every feature seen in fewer than ``min_count`` documents.

        ``N`` is left unchanged.
        """
        # Rebuild as a defaultdict (not a plain dict) so that update() keeps
        # working for previously unseen features after pruning.
        self.idf_counts = defaultdict(
            int,
            ((feature, df) for feature, df in self.idf_counts.items()
             if df >= min_count),
        )

    def corpus_frequency(self, key):
        """Return the number of documents containing ``key`` (0 if unknown)."""
        # .get() avoids inserting a zero entry into the defaultdict.
        return self.idf_counts.get(key, 0)

    def tfidf_score(self, key, count=1):
        """Return ``log(count + 1) * log(N / df(key))``.

        :param key: the feature to score.
        :param count: occurrences of ``key`` in the document being scored.
        :returns: the weight as a float; 0.0 when ``count`` is negative or
            ``key`` is not in the index.
        """
        if count < 0:
            return 0.0

        idf_count = self.idf_counts.get(key)
        # Unknown key (None) or a zero count: no weight, and no division by 0.
        if not idf_count:
            return 0.0

        return math.log(count + 1.0) * math.log(float(self.N) / idf_count)

    def tfidf_vector(self, tokens):
        """Return the L2-normalised TF-IDF scores for ``tokens``.

        Each token is scored with ``count=1``. The result has one entry per
        token, in order. If every score is zero the zero vector is returned
        instead of dividing by a zero norm.
        """
        tf_idf = [self.tfidf_score(t) for t in tokens]
        norm = math.sqrt(sum(score ** 2 for score in tf_idf))
        if not norm:
            return [0.0] * len(tf_idf)
        return [score / norm for score in tf_idf]