[similarity] Adding an in-memory IDF index for weighted similarities
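The per-feature score implemented in the new scripts/geodata/statistics/tf_idf.py below is the usual smoothed TF-IDF. In our notation (not the commit's), for a feature t occurring count times in a document, out of N documents seen of which df(t) contain t:

    tfidf(t) = log(count + 1) * log(N / df(t))

tfidf_vector then L2-normalizes the per-token scores, so a dot product between two such vectors is a cosine similarity, which is presumably the "weighted similarity" the title refers to.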
@@ -197,19 +197,16 @@ class PolygonIndex(object):
            polygons.append((feature['properties'], prep(MultiPolygon(polys))))

        return polygons

    @classmethod
    def load_index(cls, d, index_name=None):
        raise NotImplementedError('Children must implement')

    @classmethod
    def load(cls, d, index_name=None, polys_filename=DEFAULT_POLYS_FILENAME):
        index = cls.load_index(d, index_name=index_name or cls.INDEX_FILENAME)
        polys = cls.load_polygons(os.path.join(d, polys_filename))
        return cls(index=index, polygons=polys, save_dir=d)

    def get_candidate_polygons(self, lat, lon):
        raise NotImplementedError('Children must implement')
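For reference, load_index and get_candidate_polygons are the two hooks left to subclasses. A minimal sketch of what a concrete child could look like, assuming an R-tree backend via the rtree package (RTreePolygonIndex and the 'rtree' index name here are illustrative, not part of this commit):

import os

from rtree import index as rtree_index


class RTreePolygonIndex(PolygonIndex):
    INDEX_FILENAME = 'rtree'

    @classmethod
    def load_index(cls, d, index_name=None):
        # rtree persists a file-backed index as <name>.idx / <name>.dat under d
        return rtree_index.Index(os.path.join(d, index_name or cls.INDEX_FILENAME))

    def get_candidate_polygons(self, lat, lon):
        # Bounding-box query for the point (lon, lat); returns ids of polygons
        # whose boxes contain it. Exact containment is then checked against the
        # prepared geometries loaded by load_polygons.
        return list(self.index.intersection((lon, lat, lon, lat)))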
scripts/geodata/statistics/__init__.py (new file, 0 lines)
scripts/geodata/statistics/tf_idf.py (new file, 39 lines)
scripts/geodata/statistics/tf_idf.py
@@ -0,0 +1,39 @@
import math
from collections import defaultdict


class IDFIndex(object):
    finalized = False

    def __init__(self):
        self.idf_counts = defaultdict(int)
        self.N = 0

    def update(self, doc):
        if self.finalized or not doc:
            return

        for feature, count in doc.iteritems():
            self.idf_counts[feature] += 1

        self.N += 1

    def prune(self, min_count):
        self.idf_counts = {k: count for k, count in self.idf_counts.iteritems() if count >= min_count}

    def corpus_frequency(self, key):
        return self.idf_counts.get(key, 0)

    def tfidf_score(self, key, count=1):
        if count < 0:
            return 0.0

        idf_count = self.idf_counts.get(key, None)
        if idf_count is None:
            return 0.0
        return (math.log(count + 1.0) * (math.log(float(self.N) / idf_count)))

    def tfidf_vector(self, tokens):
        tf_idf = [self.tfidf_score(t) for t in tokens]
        norm = math.sqrt(sum((t ** 2 for t in tf_idf)))
        return [t / norm for t in tf_idf]
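A quick usage sketch of IDFIndex (the feature counts below are made up; like the module itself, this assumes Python 2 because of doc.iteritems()): each update call takes a feature-to-count mapping for one document and increments the document frequency of every feature present; tfidf_vector scores a token list with the default count of 1 and L2-normalizes it. Note that if no token is known to the index, every score is 0.0 and the final division by norm raises ZeroDivisionError.

idf = IDFIndex()

# one call per document; values are within-document counts (illustrative data)
idf.update({'main': 2, 'st': 1})
idf.update({'main': 1, 'ave': 1})
idf.update({'broadway': 1})

idf.corpus_frequency('main')      # 2 -- two of the three documents contain 'main'
idf.tfidf_score('st')             # log(2) * log(3 / 1)
idf.tfidf_vector(['main', 'st'])  # unit-length vector; rarer 'st' gets the larger weight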