import math
from collections import defaultdict


class IDFIndex(object):
    """In-memory document-frequency index used to compute TF-IDF weights.

    Documents are fed in one at a time through ``update``. For every feature
    the index records how many documents contained it (``idf_counts``) and
    how many non-empty documents were seen overall (``N``). Those two numbers
    are what ``tfidf_score`` and ``tfidf_vector`` need.
    """

    # update() becomes a no-op once this is true. Nothing in this class sets
    # it; callers flip it themselves when the index should stop changing.
    finalized = False

    def __init__(self):
        # feature -> number of documents in which the feature appeared
        self.idf_counts = defaultdict(int)
        # number of non-empty documents passed to update()
        self.N = 0

    def update(self, doc):
        """Add one document to the index.

        ``doc`` is a mapping of feature -> in-document count. Only the keys
        are used: each distinct feature bumps its document frequency by one.
        Empty or ``None`` documents, and any call made after ``finalized``
        has been set, are ignored.
        """
        if self.finalized or not doc:
            return

        counts = self.idf_counts
        # Iterating the mapping directly yields its keys on both Python 2 and
        # Python 3. Using .get() keeps this working after prune() has swapped
        # the defaultdict for a plain dict.
        for feature in doc:
            counts[feature] = counts.get(feature, 0) + 1

        self.N += 1

    def prune(self, min_count):
        """Drop every feature seen in fewer than ``min_count`` documents."""
        self.idf_counts = {
            feature: count
            for feature, count in self.idf_counts.items()
            if count >= min_count
        }

    def corpus_frequency(self, key):
        """Return the number of documents containing ``key`` (0 if unknown)."""
        return self.idf_counts.get(key, 0)

    def tfidf_score(self, key, count=1):
        """Return ``log(count + 1) * log(N / document_frequency(key))``.

        Returns 0.0 when ``count`` is negative or when ``key`` is not in the
        index.
        """
        if count < 0:
            return 0.0

        idf_count = self.idf_counts.get(key)
        if idf_count is None:
            return 0.0
        return math.log(count + 1.0) * math.log(float(self.N) / idf_count)

    def tfidf_vector(self, tokens):
        """Return the L2-normalised TF-IDF scores for ``tokens``.

        Each token is scored with a term count of 1. When every score is zero
        (for example all tokens are unknown) the zero vector is returned
        unchanged, because it cannot be normalised.
        """
        scores = [self.tfidf_score(token) for token in tokens]
        norm = math.sqrt(sum(score ** 2 for score in scores))
        if norm == 0.0:
            # Guard against dividing by a zero norm.
            return scores
        return [score / norm for score in scores]