from postal.text.normalize import *
# token_types and remove_parens are used below but never imported in this
# file; postal.text.tokenize is assumed (not confirmed) to be their source
from postal.text.tokenize import *

from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity, ordered_word_count

from collections import Counter


class NameDeduper(object):
    '''
    Base class for deduping geographic entity names, e.g. for matching names
    from different databases (concordances).

    By default uses Soft TFIDF similarity (see geodata.names.similarity)
    for non-ideographic names and Jaccard similarity with word frequencies
    for ideographic names.

    See class attributes for options.
    '''

    stopwords = set()
    '''Set of words which should not be considered in similarity'''

    discriminative_words = set()
    '''Set of words which break similarity e.g. North, Heights'''

    discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories which, if not contained in both sets, break similarity'''

    content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories representing content tokens (default setting ignores punctuation)'''

    replacements = {}
    '''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''

    dupe_threshold = 0.9
    '''Similarity threshold above which entities are considered dupes'''

    ignore_parentheticals = True
    '''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''

    @classmethod
    def tokenize(cls, s):
        # XOR toggles off digit replacement (on by default) so that numeric
        # tokens survive for the discriminative checks below
        token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
        return normalized_tokens(s, token_options=token_options)

    @classmethod
    def content_tokens(cls, s):
        tokens = cls.tokenize(s)
        if cls.ignore_parentheticals:
            tokens = remove_parens(tokens)
        # Keep content tokens only (words and numbers by default), apply any
        # configured replacements, and drop stopwords
        return [(cls.replacements.get(t, t), c)
                for t, c in tokens
                if c in cls.content_categories
                and t not in cls.stopwords]

    @classmethod
    def possible_match(cls, tokens1, tokens2):
        if not cls.discriminative_categories and not cls.discriminative_words:
            return True

        # A discriminative token (e.g. a house number, or a word like
        # "North") appearing in only one of the two names rules out a match
        intersection = set(t for t, c in tokens1) & set(t for t, c in tokens2)
        invalid = any(t not in intersection and
                      (c in cls.discriminative_categories or t in cls.discriminative_words)
                      for t, c in tokens1 + tokens2)
        return not invalid

    @classmethod
    def compare_ideographs(cls, s1, s2):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            # Cap the score just below the dupe threshold so this pair can
            # never be considered a dupe
            return max(cls.dupe_threshold - 0.1, 0.0)

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            # Many Han/Hangul characters are common, shouldn't use IDF
            return jaccard_similarity(tokens1_only, tokens2_only)

    @classmethod
    def compare(cls, s1, s2, idf):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            # Cap the score just below the dupe threshold
            return max(cls.dupe_threshold - 0.1, 0.0)

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        # Test exact equality, also handles things like Cabbage Town == Cabbagetown
        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)

    @classmethod
    def is_dupe(cls, sim):
        return sim >= cls.dupe_threshold
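
# Illustrative usage sketch, not part of the original module. The subclass,
# its option values, and the sample strings are hypothetical. Only the
# IDF-free path is exercised here: compare() additionally needs an idf
# argument in whatever form soft_tfidf_similarity expects, built from a
# corpus of names. The exact-concatenation check demonstrated below is shared
# by compare() and compare_ideographs(); this example assumes
# normalized_tokens lowercases its input.
if __name__ == '__main__':
    class VenueNameDeduper(NameDeduper):
        stopwords = set([u'the'])
        replacements = {u'saint': u'st'}

    # "Cabbage Town" and "Cabbagetown" concatenate to the same string after
    # normalization, so similarity is exactly 1.0 regardless of token
    # boundaries, and no IDF statistics are consulted
    sim = VenueNameDeduper.compare_ideographs(u'Cabbage Town', u'Cabbagetown')
    print(sim)
    print(VenueNameDeduper.is_dupe(sim))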