[similarity] Adding an in-memory IDF index for weighted similarities

2015-10-29 12:53:11 -04:00
parent 1c543a5271
commit 5076c0409b
3 changed files with 39 additions and 3 deletions
--- a/scripts/geodata/statistics/__init__.py
+++ b/scripts/geodata/statistics/__init__.py
--- a/scripts/geodata/statistics/tf_idf.py
+++ b/scripts/geodata/statistics/tf_idf.py
@@ -0,0 +1,39 @@
+import math
+from collections import defaultdict
+
+
+class IDFIndex(object):
+    """In-memory document-frequency index for computing TF-IDF weights.
+
+    ``idf_counts`` maps each feature to the number of documents passed to
+    ``update`` that contained it, and ``N`` is the number of non-empty
+    documents seen so far.
+    """
+
+    # Class-level default. Nothing in this class sets it to True; when a
+    # caller does, ``update`` becomes a no-op.
+    finalized = False
+
+    def __init__(self):
+        self.idf_counts = defaultdict(int)
+        self.N = 0
+
+    def update(self, doc):
+        """Record one document in the index.
+
+        ``doc`` is expected to be a mapping of feature -> count within the
+        document. Only the keys are used: each distinct feature adds one to
+        its document frequency regardless of its in-document count. Empty
+        documents, and any call made once ``finalized`` is true, are ignored.
+        """
+        if self.finalized or not doc:
+            return
+
+        # Iterating the mapping yields its keys on both Python 2 and 3.
+        for feature in doc:
+            self.idf_counts[feature] += 1
+
+        self.N += 1
+
+    def prune(self, min_count):
+        """Drop every feature whose document frequency is below ``min_count``.
+
+        ``N`` is left untouched.
+        """
+        # Rebuild as a defaultdict (not a plain dict) so update() can still
+        # add previously unseen features after pruning.
+        self.idf_counts = defaultdict(
+            int,
+            ((key, count) for key, count in self.idf_counts.items()
+             if count >= min_count)
+        )
+
+    def corpus_frequency(self, key):
+        """Return the number of indexed documents containing ``key`` (0 if unknown)."""
+        return self.idf_counts.get(key, 0)
+
+    def tfidf_score(self, key, count=1):
+        """Return ``log(count + 1) * log(N / document_frequency(key))``.
+
+        Returns 0.0 when ``count`` is negative or when ``key`` has no
+        recorded occurrences in the index.
+        """
+        if count < 0:
+            return 0.0
+
+        idf_count = self.idf_counts.get(key)
+        # A missing key and a zero count both mean "never seen". The zero
+        # case can arise if a caller indexes the defaultdict directly, and
+        # would otherwise divide by zero below.
+        if not idf_count:
+            return 0.0
+        return math.log(count + 1.0) * math.log(float(self.N) / idf_count)
+
+    def tfidf_vector(self, tokens):
+        """Return the L2-normalized TF-IDF scores of ``tokens``, in order.
+
+        Each token is scored independently with a count of 1. If every score
+        is zero (all tokens unknown, or present in every document) the zero
+        vector is returned as-is instead of dividing by a zero norm.
+        """
+        tf_idf = [self.tfidf_score(t) for t in tokens]
+        norm = math.sqrt(sum(t ** 2 for t in tf_idf))
+        if norm == 0.0:
+            return tf_idf
+        return [t / norm for t in tf_idf]