[similarity] Adding NameDeduper base class for deduping geographic names using the new Soft TFIDF similarity
scripts/geodata/names/deduping.py (new file, 103 lines)
@@ -0,0 +1,103 @@
from postal.text.normalize import *
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity, ordered_word_count

from collections import Counter

class NameDeduper(object):
    '''
    Base class for deduping geographic entity names e.g. for matching names
    from different databases (concordances).

    By default uses Soft TFIDF similarity (see geodata.names.similarity)
    for non-ideographic names and Jaccard similarity with word frequencies
    for ideographic names.

    See class attributes for options.
    '''

    stopwords = set()
    '''Set of words which should not be considered in similarity'''

    discriminative_words = set()
    '''Set of words which break similarity e.g. North, Heights'''

    discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories which, if not contained in both sets, break similarity'''

    content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories representing content tokens (default setting ignores punctuation)'''

    replacements = {}
    '''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''

    dupe_threshold = 0.9
    '''Similarity threshold above which entities are considered dupes'''

    ignore_parentheticals = True
    '''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''
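
    # Note: tokenize uses libpostal's DEFAULT_TOKEN_OPTIONS but XORs out
    # NORMALIZE_TOKEN_REPLACE_DIGITS (assuming that flag is set in the
    # defaults, the XOR toggles digit replacement off), so numeric tokens
    # like u'6th' survive intact for the discriminative checks below.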
    @classmethod
    def tokenize(cls, s):
        token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
        return normalized_tokens(s, token_options=token_options)

    @classmethod
    def content_tokens(cls, s):
        tokens = cls.tokenize(s)
        if cls.ignore_parentheticals:
            tokens = remove_parens(tokens)
        return [(cls.replacements.get(t, t), c)
                for t, c in tokens
                if c in cls.content_categories
                and t not in cls.stopwords]
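
    # possible_match is a cheap pre-filter over (token, category) pairs:
    # e.g. u'6th Avenue' vs. u'7th Avenue' share no numeric token, and
    # numeric tokens are discriminative by default, so the pair is ruled
    # out before any similarity computation.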
    @classmethod
    def possible_match(cls, tokens1, tokens2):
        if not cls.discriminative_categories and not cls.discriminative_words:
            return True

        intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])
        invalid = any((True for t, c in tokens1 + tokens2
                       if t not in intersection and
                       (c in cls.discriminative_categories or t in cls.discriminative_words)
                       ))
        return not invalid
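
    # When possible_match fails, both compare methods return
    # max(dupe_threshold - 0.1, 0.0): a sentinel strictly below
    # dupe_threshold, so is_dupe always returns False for such pairs.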
    @classmethod
    def compare_ideographs(cls, s1, s2):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return max(cls.dupe_threshold - 0.1, 0.0)

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            # Many Han/Hangul characters are common, shouldn't use IDF
            return jaccard_similarity(tokens1_only, tokens2_only)
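
    # idf is the inverse-document-frequency index consumed by
    # soft_tfidf_similarity (see geodata.names.similarity), presumably
    # built once over the full corpus of names rather than per comparison.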
    @classmethod
    def compare(cls, s1, s2, idf):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return max(cls.dupe_threshold - 0.1, 0.0)

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        # Test exact equality, also handles things like Cabbage Town == Cabbagetown
        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)

    @classmethod
    def is_dupe(cls, sim):
        return sim >= cls.dupe_threshold
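
A minimal usage sketch (not part of this commit), assuming the module is importable as geodata.names.deduping per the file path above. The ToponymDeduper subclass and its settings are hypothetical, and building a real IDF index for soft_tfidf_similarity is out of scope here:

from geodata.names.deduping import NameDeduper


class ToponymDeduper(NameDeduper):
    # Hypothetical settings, chosen for illustration only
    discriminative_words = set([u'north', u'south', u'east', u'west'])
    replacements = {u'saint': u'st', u'mount': u'mt'}


# Constructing an IDF index is not shown in this diff; None is a
# placeholder. With the replacements above, both names normalize to
# [u'mt', u'vernon'], so compare returns 1.0 from the exact-equality
# branch before the idf is ever consulted.
idf = None

sim = ToponymDeduper.compare(u'Mount Vernon', u'Mt Vernon', idf)
assert ToponymDeduper.is_dupe(sim)  # True: 1.0 >= dupe_threshold (0.9)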