[similarity] Adding an in-memory IDF index for weighted similarities

This commit is contained in:
Al
2015-10-29 12:53:11 -04:00
parent 1c543a5271
commit 5076c0409b
3 changed files with 39 additions and 3 deletions

View File

@@ -197,19 +197,16 @@ class PolygonIndex(object):
polygons.append((feature['properties'], prep(MultiPolygon(polys))))
return polygons
@classmethod
def load_index(cls, d, index_name=None):
    """Load the index identified by ``index_name`` for directory ``d``.

    Abstract hook: the base class has no index format of its own, so this
    implementation always raises and subclasses are expected to override it.

    :param d: location argument forwarded by ``load``
    :param index_name: name of the index to load (optional)
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError('Children must implement')
@classmethod
def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME):
    """Build an instance from the index and polygons stored under ``d``.

    :param d: directory holding the saved files; also passed to the
        constructor as ``save_dir``
    :param index_name: name handed to ``load_index``; when falsy,
        ``cls.INDEX_FILENAME`` is used instead
    :param polys_filename: polygons file name, joined onto ``d``
    :return: a new ``cls`` built from the loaded index and polygons
    """
    # A missing or empty name falls back to the class-level default.
    resolved_name = index_name if index_name else cls.INDEX_FILENAME
    loaded_index = cls.load_index(d, index_name=resolved_name)

    polygons_path = os.path.join(d, polys_filename)
    loaded_polygons = cls.load_polygons(polygons_path)

    return cls(index=loaded_index, polygons=loaded_polygons, save_dir=d)
def get_candidate_polygons(self, lat, lon):
    """Return the candidate polygons for the point ``(lat, lon)``.

    Abstract hook: this base implementation always raises, so subclasses
    are expected to provide the actual lookup.

    :param lat: latitude of the query point
    :param lon: longitude of the query point
    :raises NotImplementedError: always, in this base implementation
    """
    raise NotImplementedError('Children must implement')

View File

View File

@@ -0,0 +1,39 @@
import math
from collections import defaultdict
class IDFIndex(object):
    """In-memory document-frequency index used to compute TF-IDF weights.

    ``idf_counts`` maps each feature to the number of documents it was seen
    in, and ``N`` is the number of non-empty documents passed to ``update``.
    """

    # When set to True (on the class or an instance), ``update`` becomes a
    # no-op. Nothing in this class sets it.
    finalized = False

    def __init__(self):
        self.idf_counts = defaultdict(int)
        self.N = 0

    def update(self, doc):
        """Register one document.

        :param doc: mapping of feature -> count within the document. Only
            the keys are used: each distinct feature adds 1 to its document
            frequency. Empty/None documents are ignored and not counted.
        """
        if self.finalized or not doc:
            return

        # Iterating the mapping yields its keys on both Python 2 and 3;
        # the per-document counts are irrelevant for document frequency.
        for feature in doc:
            self.idf_counts[feature] += 1

        self.N += 1

    def prune(self, min_count):
        """Drop every feature seen in fewer than ``min_count`` documents."""
        # Rebuild as a defaultdict so ``update`` keeps working for features
        # that first appear after pruning (a plain dict raises KeyError).
        kept = defaultdict(int)
        for key, count in self.idf_counts.items():
            if count >= min_count:
                kept[key] = count
        self.idf_counts = kept

    def corpus_frequency(self, key):
        """Return the number of documents containing ``key`` (0 if unknown)."""
        # ``get`` avoids inserting a zero entry into the defaultdict.
        return self.idf_counts.get(key, 0)

    def tfidf_score(self, key, count=1):
        """Return ``log(count + 1) * log(N / df(key))``.

        :param key: feature to score
        :param count: term frequency of ``key`` in the document being scored
        :return: the weight as a float; 0.0 for a negative ``count`` or a
            feature with no recorded document frequency
        """
        if count < 0:
            return 0.0

        idf_count = self.idf_counts.get(key)
        # Treats a missing entry and a zero entry alike, which also rules
        # out a division by zero below.
        if not idf_count:
            return 0.0

        return math.log(count + 1.0) * math.log(float(self.N) / idf_count)

    def tfidf_vector(self, tokens):
        """Return the L2-normalized TF-IDF weights for ``tokens``.

        Each token is scored with a term count of 1, and the result has one
        entry per input token in the same order. If every weight is zero
        (e.g. all tokens are unknown or occur in every document) the
        all-zero vector is returned as is instead of dividing by zero.
        """
        scores = [self.tfidf_score(token) for token in tokens]
        norm = math.sqrt(sum(score ** 2 for score in scores))
        if norm == 0.0:
            return scores
        return [score / norm for score in scores]