[similarity] Adding NameDeduper base class for deduping geographic names using the new Soft TFIDF similarity
scripts/geodata/names/deduping.py (new file, 103 lines)
@@ -0,0 +1,103 @@
from postal.text.normalize import *
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity, ordered_word_count

from collections import Counter

class NameDeduper(object):
    '''
    Base class for deduping geographic entity names e.g. for matching names
    from different databases (concordances).

    By default uses Soft TFIDF similarity (see geodata.names.similarity)
    for non-ideographic names and Jaccard similarity with word frequencies
    for ideographic names.

    See class attributes for options.
    '''

    stopwords = set()
    '''Set of words which should not be considered in similarity'''

    discriminative_words = set()
    '''Set of words which break similarity e.g. North, Heights'''

    discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories which, if not contained in both sets, break similarity'''

    content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories representing content tokens (default setting ignores punctuation)'''

    replacements = {}
    '''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''

    dupe_threshold = 0.9
    '''Similarity threshold above which entities are considered dupes'''

    ignore_parentheticals = True
    '''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''
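
    # Note: tokenize uses libpostal's DEFAULT_TOKEN_OPTIONS but XORs out
    # NORMALIZE_TOKEN_REPLACE_DIGITS (assuming that flag is set in the
    # defaults, the XOR toggles digit replacement off), so numeric tokens
    # like u'6th' survive intact for the discriminative checks below.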
    @classmethod
    def tokenize(cls, s):
        token_options = DEFAULT_TOKEN_OPTIONS ^ NORMALIZE_TOKEN_REPLACE_DIGITS
        return normalized_tokens(s, token_options=token_options)

    @classmethod
    def content_tokens(cls, s):
        tokens = cls.tokenize(s)
        if cls.ignore_parentheticals:
            tokens = remove_parens(tokens)
        return [(cls.replacements.get(t, t), c)
                for t, c in tokens
                if c in cls.content_categories
                and t not in cls.stopwords]
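
    # possible_match is a cheap pre-filter over (token, category) pairs:
    # e.g. u'6th Avenue' vs. u'7th Avenue' share no numeric token, and
    # numeric tokens are discriminative by default, so the pair is ruled
    # out before any similarity computation.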
    @classmethod
    def possible_match(cls, tokens1, tokens2):
        if not cls.discriminative_categories and not cls.discriminative_words:
            return True

        intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])
        invalid = any((True for t, c in tokens1 + tokens2
                       if t not in intersection and
                       (c in cls.discriminative_categories or t in cls.discriminative_words)
                       ))
        return not invalid
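
    # When possible_match fails, both compare methods return
    # max(dupe_threshold - 0.1, 0.0): a sentinel strictly below
    # dupe_threshold, so is_dupe always returns False for such pairs.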
    @classmethod
    def compare_ideographs(cls, s1, s2):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return max(cls.dupe_threshold - 0.1, 0.0)

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            # Many Han/Hangul characters are common, shouldn't use IDF
            return jaccard_similarity(tokens1_only, tokens2_only)
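
    # idf is the inverse-document-frequency index consumed by
    # soft_tfidf_similarity (see geodata.names.similarity), presumably
    # built once over the full corpus of names rather than per comparison.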
    @classmethod
    def compare(cls, s1, s2, idf):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return max(cls.dupe_threshold - 0.1, 0.0)

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        # Test exact equality, also handles things like Cabbage Town == Cabbagetown
        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)

    @classmethod
    def is_dupe(cls, sim):
        return sim >= cls.dupe_threshold
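
A minimal usage sketch (not part of this commit), assuming the module is importable as geodata.names.deduping per the file path above. The ToponymDeduper subclass and its settings are hypothetical, and building a real IDF index for soft_tfidf_similarity is out of scope here:

from geodata.names.deduping import NameDeduper


class ToponymDeduper(NameDeduper):
    # Hypothetical settings, chosen for illustration only
    discriminative_words = set([u'north', u'south', u'east', u'west'])
    replacements = {u'saint': u'st', u'mount': u'mt'}


# Constructing an IDF index is not shown in this diff; None is a
# placeholder. With the replacements above, both names normalize to
# [u'mt', u'vernon'], so compare returns 1.0 from the exact-equality
# branch before the idf is ever consulted.
idf = None

sim = ToponymDeduper.compare(u'Mount Vernon', u'Mt Vernon', idf)
assert ToponymDeduper.is_dupe(sim)  # True: 1.0 >= dupe_threshold (0.9)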