Initial fork commit
scripts/geodata/names/__init__.py (new file, 0 lines)

scripts/geodata/names/deduping.py (new file, 102 lines)
@@ -0,0 +1,102 @@
from geodata.text.normalize import *
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity

from collections import Counter


class NameDeduper(object):
    '''
    Base class for deduping geographic entity names e.g. for matching names
    from different databases (concordances).

    By default uses Soft TFIDF similarity (see geodata.names.similarity)
    for non-ideographic names and Jaccard similarity with word frequencies
    for ideographic names.

    See class attributes for options.
    '''

    stopwords = set()
    '''Set of words which should not be considered in similarity'''

    discriminative_words = set()
    '''Set of words which break similarity e.g. North, Heights'''

    discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories which, if not contained in both sets, break similarity'''

    content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories representing content tokens (default setting ignores punctuation)'''

    replacements = {}
    '''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''

    dupe_threshold = 0.9
    '''Similarity threshold above which entities are considered dupes'''

    ignore_parentheticals = True
    '''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''

    @classmethod
    def tokenize(cls, s):
        return normalized_tokens(s)

    @classmethod
    def content_tokens(cls, s):
        tokens = cls.tokenize(s)
        if cls.ignore_parentheticals:
            tokens = remove_parens(tokens)
        return [(cls.replacements.get(t, t), c)
                for t, c in tokens
                if c in cls.content_categories and
                t not in cls.stopwords]

    @classmethod
    def possible_match(cls, tokens1, tokens2):
        if not cls.discriminative_categories and not cls.discriminative_words:
            return True

        intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])
        invalid = any((True for t, c in tokens1 + tokens2
                       if t not in intersection and
                       (c in cls.discriminative_categories or t in cls.discriminative_words)
                       ))
        return not invalid

    @classmethod
    def compare_ideographs(cls, s1, s2):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return 0.0

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            # Many Han/Hangul characters are common, shouldn't use IDF
            return jaccard_similarity(tokens1_only, tokens2_only)

    @classmethod
    def compare(cls, s1, s2, idf):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return 0.0

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        # Test exact equality, also handles things like Cabbage Town == Cabbagetown
        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)

    @classmethod
    def is_dupe(cls, sim):
        return sim >= cls.dupe_threshold
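
Usage note (illustrative, not part of the commit): a minimal sketch of how NameDeduper might be subclassed and called, assuming the geodata package is importable and an IDF index has been built elsewhere (the docstring in similarity.py points at IDFIndex from geodata.statistics.tf_idf). The subclass and helper names below are hypothetical.

# Hypothetical usage sketch only; the names defined here are not in the commit.
from geodata.names.deduping import NameDeduper


class ConcordanceDeduper(NameDeduper):
    # Treat "saint" and "st" as the same token and ignore a filler word
    replacements = {u'saint': u'st'}
    stopwords = set([u'the'])
    dupe_threshold = 0.9


def names_are_dupes(name1, name2, idf):
    # idf is expected to behave like geodata.statistics.tf_idf.IDFIndex
    sim = ConcordanceDeduper.compare(name1, name2, idf)
    return ConcordanceDeduper.is_dupe(sim)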
scripts/geodata/names/normalization.py (new file, 119 lines)
@@ -0,0 +1,119 @@
import os
import re
import six
import yaml

from geodata.encoding import safe_decode

this_dir = os.path.realpath(os.path.dirname(__file__))

AFFIX_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                'resources', 'boundaries', 'names', 'languages')


class NameAffixes(object):
    def __init__(self, config_dir=AFFIX_CONFIG_DIR):
        self.config_dir = config_dir

        self.language_prefixes = {}
        self.language_suffixes = {}

        self.language_prefix_regexes = {}
        self.language_suffix_regexes = {}

        self.language_prefix_sim_only_regexes = {}
        self.language_suffix_sim_only_regexes = {}

        for filename in os.listdir(config_dir):
            if not filename.endswith('.yaml'):
                continue
            lang = filename.rsplit('.yaml')[0]

            conf = yaml.load(open(os.path.join(config_dir, filename)))
            self.add_affixes(lang, conf)

            for country, country_conf in six.iteritems(conf.get('countries', {})):
                country_lang = (country, lang)
                self.add_affixes(country_lang, country_conf)

    def add_affixes(self, lang, *confs):
        prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
        prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]

        self.language_prefixes[lang] = prefixes + prefixes_no_whitespace

        suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
        suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]

        self.language_suffixes[lang] = suffixes + suffixes_no_whitespace

        whitespace_phrase = six.u('[ \-]')

        all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
        all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace

        if all_prefixes:
            prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
            self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)

        if all_suffixes:
            suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
            self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)

        sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]
        if sim_only_prefixes:
            sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
            self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)

        sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]
        if sim_only_suffixes:
            sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
            self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)

    def replace_prefixes(self, name, lang, country=None, sim_only=False):
        name = safe_decode(name).strip()

        if not sim_only or lang not in self.language_prefix_sim_only_regexes:
            d = self.language_prefix_regexes
        else:
            d = self.language_prefix_sim_only_regexes

        re = None
        if country is not None:
            re = d.get((country, lang))
            if re:
                name = re.sub(six.u(''), name)

        re = d.get(lang)

        if not re:
            return name

        return re.sub(six.u(''), name)

    def replace_suffixes(self, name, lang, country=None, sim_only=False):
        name = safe_decode(name).strip()

        if not sim_only or lang not in self.language_suffix_sim_only_regexes:
            d = self.language_suffix_regexes
        else:
            d = self.language_suffix_sim_only_regexes

        re = None
        if country is not None:
            re = d.get((country, lang))
            if re:
                name = re.sub(six.u(''), name)

        re = d.get(lang)

        if not re:
            return name

        return re.sub(six.u(''), name)

    def replace_affixes(self, name, lang, country=None, sim_only=False):
        return self.replace_prefixes(self.replace_suffixes(name, lang, country=country, sim_only=sim_only), lang, country=country, sim_only=sim_only)


name_affixes = NameAffixes()
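
Usage note (illustrative, not part of the commit): the module-level name_affixes instance strips language-specific name prefixes and suffixes defined in the YAML files under resources/boundaries/names/languages. A minimal sketch, assuming the geodata package is importable and that a config such as lt.yaml exists and lists "gatvė" as a suffix; the example name, language code, and config are assumptions for illustration.

# Hypothetical usage sketch only; the example values are assumptions.
from geodata.names.normalization import name_affixes

# Strip configured affixes for a given language code (and optionally country).
display_name = name_affixes.replace_affixes(u'Gedimino gatvė', 'lt')

# sim_only=True additionally applies the *_similarity_only affixes, i.e. phrases
# that should be ignored when comparing names but kept for display.
match_name = name_affixes.replace_affixes(u'Gedimino gatvė', 'lt', sim_only=True)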
scripts/geodata/names/similarity.py (new file, 85 lines)
@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
import Levenshtein

from collections import OrderedDict


def ordered_word_count(tokens):
    counts = OrderedDict()
    for k in tokens:
        counts[k] = counts.get(k, 0) + 1
    return counts


def soft_tfidf_similarity(tokens1, tokens2, idf,
                          sim_func=Levenshtein.jaro_winkler, theta=0.95,
                          common_word_threshold=100):
    '''
    Soft TFIDF is a hybrid distance function using both global statistics
    (inverse document frequency) and local similarity (Jaro-Winkler).

    For each token t1 in the first string, find the token t2 which is most
    similar to t1 in terms of the local distance function.

    The SoftTFIDF similarity is the dot product of the max token similarities
    and the cosine similarity of the TF-IDF vectors for all tokens where
    the max similarity is >= a given threshold theta.

    sim_func should return a number in the range [0, 1] and theta
    should be in the same range, i.e. this would _not_ work for a metric like
    basic Levenshtein or Damerau-Levenshtein distance where we'd want the
    value to be below the threshold. Those metrics can be transformed into
    a [0, 1] measure.

    @param tokens1: normalized tokens of string 1 (list of strings only)
    @param tokens2: normalized tokens of string 2 (list of strings only)

    @param idf: IDFIndex from geodata.statistics.tf_idf
    @param sim_func: similarity function which takes 2 strings and returns
                     a number between 0 and 1
    @param theta: token-level threshold on sim_func's return value at
                  which point two tokens are considered "close"

    Reference:
    https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
    '''

    token1_counts = ordered_word_count(tokens1)
    token2_counts = ordered_word_count(tokens2)

    tfidf1 = idf.tfidf_vector(token1_counts)
    tfidf2 = idf.tfidf_vector(token2_counts)

    total_sim = 0.0

    t1_len = len(token1_counts)
    t2_len = len(token2_counts)

    if t2_len < t1_len:
        token1_counts, token2_counts = token2_counts, token1_counts
        tfidf1, tfidf2 = tfidf2, tfidf1

    for i, t1 in enumerate(token1_counts):
        sim, j = max([(sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts)])
        if sim >= theta:
            total_sim += sim * tfidf1[i] * tfidf2[j]

    return total_sim


def jaccard_similarity(tokens1, tokens2):
    '''
    Traditionally Jaccard similarity is defined for two sets:

    Jaccard(A, B) = |A ∩ B| / |A ∪ B|

    Using this for tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
    would be 1.0, which is not ideal for entity name matching.

    In this implementation the cardinalities of the set intersection/union
    are weighted by term frequencies so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 0.67
    '''
    token1_counts = ordered_word_count(tokens1)
    token2_counts = ordered_word_count(tokens2)

    intersection = sum((min(v, token2_counts[k]) for k, v in token1_counts.iteritems() if k in token2_counts))
    return float(intersection) / (sum(token1_counts.values()) + sum(token2_counts.values()) - intersection)
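
Worked example (illustrative, not part of the commit): jaccard_similarity needs no corpus statistics, so the figure in its docstring can be checked directly; soft_tfidf_similarity additionally needs an object exposing tfidf_vector(), normally an IDFIndex from geodata.statistics.tf_idf. The UniformIDF stand-in below is hypothetical and exists only to make the call self-contained.

# Hypothetical usage sketch only; UniformIDF is a stand-in, not part of geodata.
import math

from geodata.names.similarity import jaccard_similarity, soft_tfidf_similarity

# Frequency-weighted Jaccard from the docstring: 2 / (3 + 2 - 2) = 0.67
print jaccard_similarity([u'a', u'a', u'b'], [u'a', u'b'])


class UniformIDF(object):
    '''Stand-in for IDFIndex: uniform, L2-normalized weight for every token.'''
    def tfidf_vector(self, counts):
        n = len(counts)
        return [1.0 / math.sqrt(n)] * n


# Identical token lists score 1.0; near-matches such as u'centre' vs. u'center'
# still clear the default Jaro-Winkler threshold of 0.95.
print soft_tfidf_similarity([u'kangaroo', u'point'], [u'kangaroo', u'point'], UniformIDF())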