From 5076c0409b32e3d646b9390beb6eca3cf5fb7007 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Thu, 29 Oct 2015 12:53:11 -0400
Subject: [PATCH] [similarity] Adding an in-memory IDF index for weighted
 similarities

---
 scripts/geodata/polygons/index.py      |  3 --
 scripts/geodata/statistics/__init__.py |  0
 scripts/geodata/statistics/tf_idf.py   | 39 ++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 scripts/geodata/statistics/__init__.py
 create mode 100644 scripts/geodata/statistics/tf_idf.py

diff --git a/scripts/geodata/polygons/index.py b/scripts/geodata/polygons/index.py
index 1662a541..fa58c5a7 100644
--- a/scripts/geodata/polygons/index.py
+++ b/scripts/geodata/polygons/index.py
@@ -197,19 +197,16 @@ class PolygonIndex(object):
                 polygons.append((feature['properties'], prep(MultiPolygon(polys))))
         return polygons
 
-
     @classmethod
     def load_index(cls, d, index_name=None):
         raise NotImplementedError('Children must implement')
 
-
     @classmethod
     def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME):
         index = cls.load_index(d, index_name=index_name or cls.INDEX_FILENAME)
         polys = cls.load_polygons(os.path.join(d, polys_filename))
         return cls(index=index, polygons=polys, save_dir=d)
 
-
     def get_candidate_polygons(self, lat, lon):
         raise NotImplementedError('Children must implement')
 
diff --git a/scripts/geodata/statistics/__init__.py b/scripts/geodata/statistics/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/geodata/statistics/tf_idf.py b/scripts/geodata/statistics/tf_idf.py
new file mode 100644
index 00000000..37f80bf7
--- /dev/null
+++ b/scripts/geodata/statistics/tf_idf.py
@@ -0,0 +1,85 @@
+import math
+from collections import defaultdict
+
+
+class IDFIndex(object):
+    """In-memory inverse document frequency (IDF) index.
+
+    Documents are added one at a time with ``update``; ``tfidf_score`` and
+    ``tfidf_vector`` then weight features by how rare they are across the
+    documents added so far.
+    """
+
+    # While False, ``update`` accepts documents. Nothing in this class sets
+    # it to True; setting it on an instance turns ``update`` into a no-op.
+    finalized = False
+
+    def __init__(self):
+        # feature -> number of documents containing that feature
+        self.idf_counts = defaultdict(int)
+        # number of non-empty documents added through update()
+        self.N = 0
+
+    def update(self, doc):
+        """Count one document towards the document frequencies.
+
+        ``doc`` is a mapping keyed by feature; only the keys are used, so a
+        feature counts once per document whatever its value. Does nothing
+        when the index is finalized or ``doc`` is empty or None.
+        """
+        if self.finalized or not doc:
+            return
+
+        # Iterating a mapping yields its keys on both Python 2 and 3.
+        for feature in doc:
+            self.idf_counts[feature] += 1
+
+        self.N += 1
+
+    def prune(self, min_count):
+        """Drop features seen in fewer than ``min_count`` documents.
+
+        ``N`` is not changed.
+        """
+        # Keep a defaultdict: with a plain dict, update() would raise
+        # KeyError on the first feature it has not seen before.
+        kept = defaultdict(int)
+        for feature, count in self.idf_counts.items():
+            if count >= min_count:
+                kept[feature] = count
+        self.idf_counts = kept
+
+    def corpus_frequency(self, key):
+        """Return the number of documents containing ``key`` (0 if unknown)."""
+        return self.idf_counts.get(key, 0)
+
+    def tfidf_score(self, key, count=1):
+        """Return ``log(count + 1) * log(N / document frequency)`` for ``key``.
+
+        Returns 0.0 for a negative ``count`` and for features with no
+        recorded documents.
+        """
+        if count < 0:
+            return 0.0
+
+        idf_count = self.idf_counts.get(key)
+        # Missing key, or a zero entry created by reading the defaultdict
+        # directly; either way there is nothing to divide by.
+        if not idf_count:
+            return 0.0
+        return math.log(count + 1.0) * math.log(float(self.N) / idf_count)
+
+    def tfidf_vector(self, tokens):
+        """Return the L2-normalized TF-IDF scores of ``tokens``, in order.
+
+        Every token is scored with a count of 1. If all scores are zero the
+        zero vector is returned as is.
+        """
+        tf_idf = [self.tfidf_score(t) for t in tokens]
+        norm = math.sqrt(sum(t ** 2 for t in tf_idf))
+        # All-zero scores (unknown tokens, or tokens present in every
+        # document) cannot be normalized; dividing would raise
+        # ZeroDivisionError.
+        if norm == 0:
+            return tf_idf
+        return [t / norm for t in tf_idf]