Files
libpostal-addrss/scripts/geodata/names/deduping.py
2025-09-06 22:03:29 -04:00

103 lines
3.4 KiB
Python

from geodata.text.normalize import *
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity
from collections import Counter
class NameDeduper(object):
'''
Base class for deduping geographic entity names e.g. for matching names
from different databases (concordances).
By default uses Soft TFIDF similarity (see geodata.names.similarity)
for non-ideographic names and Jaccard similarity with word frequencies
for ideographic names.
See class attributes for options.
'''
stopwords = set()
'''Set of words which should not be considered in similarity'''
discriminative_words = set()
'''Set of words which break similarity e.g. North, Heights'''
discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
'''Set of categories which, if not contained in both sets, break similarity'''
content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
'''Set of categories representing content tokens (default setting ignores punctuation)'''
replacements = {}
'''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''
dupe_threshold = 0.9
'''Similarity threshold above which entities are considered dupes'''
ignore_parentheticals = True
'''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''
@classmethod
def tokenize(cls, s):
return normalized_tokens(s)
@classmethod
def content_tokens(cls, s):
tokens = cls.tokenize(s)
if cls.ignore_parentheticals:
tokens = remove_parens(tokens)
return [(cls.replacements.get(t, t), c)
for t, c in tokens
if c in cls.content_categories and
t not in cls.stopwords]
@classmethod
def possible_match(cls, tokens1, tokens2):
if not cls.discriminative_categories and not cls.discriminative_words:
return True
intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])
invalid = any((True for t, c in tokens1 + tokens2
if t not in intersection and
(c in cls.discriminative_categories or t in cls.discriminative_words)
))
return not invalid
@classmethod
def compare_ideographs(cls, s1, s2):
tokens1 = cls.content_tokens(s1)
tokens2 = cls.content_tokens(s2)
if not cls.possible_match(tokens1, tokens2):
return 0.0
tokens1_only = [t for t, c in tokens1]
tokens2_only = [t for t, c in tokens2]
if u''.join(tokens1_only) == u''.join(tokens2_only):
return 1.0
else:
# Many Han/Hangul characters are common, shouldn't use IDF
return jaccard_similarity(tokens1_only, tokens2_only)
@classmethod
def compare(cls, s1, s2, idf):
tokens1 = cls.content_tokens(s1)
tokens2 = cls.content_tokens(s2)
if not cls.possible_match(tokens1, tokens2):
return 0.0
tokens1_only = [t for t, c in tokens1]
tokens2_only = [t for t, c in tokens2]
# Test exact equality, also handles things like Cabbage Town == Cabbagetown
if u''.join(tokens1_only) == u''.join(tokens2_only):
return 1.0
else:
return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)
@classmethod
def is_dupe(cls, sim):
return sim >= cls.dupe_threshold