Initial fork commit

2025-09-06 22:03:29 -04:00
commit 2d238cd339
1748 changed files with 932506 additions and 0 deletions

@@ -0,0 +1,102 @@
from geodata.text.normalize import *
from geodata.names.similarity import soft_tfidf_similarity, jaccard_similarity

from collections import Counter


class NameDeduper(object):
    '''
    Base class for deduping geographic entity names e.g. for matching names
    from different databases (concordances).

    By default uses Soft TFIDF similarity (see geodata.names.similarity)
    for non-ideographic names and Jaccard similarity with word frequencies
    for ideographic names.

    See class attributes for options.
    '''

    stopwords = set()
    '''Set of words which should not be considered in similarity'''

    discriminative_words = set()
    '''Set of words which break similarity e.g. North, Heights'''

    discriminative_categories = token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories which, if not contained in both sets, break similarity'''

    content_categories = token_types.WORD_TOKEN_TYPES | token_types.NUMERIC_TOKEN_TYPES
    '''Set of categories representing content tokens (default setting ignores punctuation)'''

    replacements = {}
    '''Dictionary of lowercased token replacements e.g. {u'saint': u'st'}'''

    dupe_threshold = 0.9
    '''Similarity threshold above which entities are considered dupes'''

    ignore_parentheticals = True
    '''Whether to ignore parenthetical phrases e.g. "Kangaroo Point (NSW)"'''

    @classmethod
    def tokenize(cls, s):
        return normalized_tokens(s)

    @classmethod
    def content_tokens(cls, s):
        tokens = cls.tokenize(s)
        if cls.ignore_parentheticals:
            tokens = remove_parens(tokens)

        return [(cls.replacements.get(t, t), c)
                for t, c in tokens
                if c in cls.content_categories and
                t not in cls.stopwords]

    @classmethod
    def possible_match(cls, tokens1, tokens2):
        if not cls.discriminative_categories and not cls.discriminative_words:
            return True

        intersection = set([t for t, c in tokens1]) & set([t for t, c in tokens2])

        invalid = any((True for t, c in tokens1 + tokens2
                       if t not in intersection and
                       (c in cls.discriminative_categories or t in cls.discriminative_words)
                       ))

        return not invalid

    @classmethod
    def compare_ideographs(cls, s1, s2):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return 0.0

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            # Many Han/Hangul characters are common, shouldn't use IDF
            return jaccard_similarity(tokens1_only, tokens2_only)

    @classmethod
    def compare(cls, s1, s2, idf):
        tokens1 = cls.content_tokens(s1)
        tokens2 = cls.content_tokens(s2)

        if not cls.possible_match(tokens1, tokens2):
            return 0.0

        tokens1_only = [t for t, c in tokens1]
        tokens2_only = [t for t, c in tokens2]

        # Test exact equality, also handles things like Cabbage Town == Cabbagetown
        if u''.join(tokens1_only) == u''.join(tokens2_only):
            return 1.0
        else:
            return soft_tfidf_similarity(tokens1_only, tokens2_only, idf)

    @classmethod
    def is_dupe(cls, sim):
        return sim >= cls.dupe_threshold
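A brief usage sketch for NameDeduper. The subclass name, its attribute values, and the idf object are illustrative assumptions; idf stands in for the IDFIndex from geodata.statistics.tf_idf that compare() expects.

# Hypothetical subclass for illustration; these attribute values are not
# taken from the repository.
class NeighborhoodDeduper(NameDeduper):
    stopwords = set([u'the'])
    discriminative_words = set([u'north', u'south', u'heights'])
    replacements = {u'saint': u'st'}

# idf is assumed to be a trained IDFIndex from geodata.statistics.tf_idf
sim = NeighborhoodDeduper.compare(u'Cabbage Town', u'Cabbagetown', idf)
is_dupe = NeighborhoodDeduper.is_dupe(sim)
# Here compare() returns 1.0 via the exact-equality check, since the
# concatenated content tokens are identical ("Cabbage Town" and "Cabbagetown"
# join to the same string), so is_dupe is True.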

@@ -0,0 +1,119 @@
import os
import re
import six
import yaml

from geodata.encoding import safe_decode

this_dir = os.path.realpath(os.path.dirname(__file__))

AFFIX_CONFIG_DIR = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
                                'resources', 'boundaries', 'names', 'languages')


class NameAffixes(object):
    def __init__(self, config_dir=AFFIX_CONFIG_DIR):
        self.config_dir = config_dir

        self.language_prefixes = {}
        self.language_suffixes = {}

        self.language_prefix_regexes = {}
        self.language_suffix_regexes = {}

        self.language_prefix_sim_only_regexes = {}
        self.language_suffix_sim_only_regexes = {}

        for filename in os.listdir(config_dir):
            if not filename.endswith('.yaml'):
                continue

            lang = filename.rsplit('.yaml')[0]
            conf = yaml.load(open(os.path.join(config_dir, filename)))

            self.add_affixes(lang, conf)

            # Country-specific overrides are keyed on (country, lang)
            for country, country_conf in six.iteritems(conf.get('countries', {})):
                country_lang = (country, lang)
                self.add_affixes(country_lang, country_conf)

    def add_affixes(self, lang, *confs):
        prefixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes', [])]
        prefixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('prefixes_no_whitespace', [])]

        self.language_prefixes[lang] = prefixes + prefixes_no_whitespace

        suffixes = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes', [])]
        suffixes_no_whitespace = [safe_decode(phrase).lower() for conf in confs for phrase in conf.get('suffixes_no_whitespace', [])]

        self.language_suffixes[lang] = suffixes + suffixes_no_whitespace

        whitespace_phrase = six.u('[ \-]')

        all_prefixes = [six.u('{}{}').format(s, whitespace_phrase) for s in prefixes] + prefixes_no_whitespace
        all_suffixes = [six.u('{}{}').format(whitespace_phrase, s) for s in suffixes] + suffixes_no_whitespace

        if all_prefixes:
            prefix_regex = six.u('^(?:{})').format(six.u('|').join(all_prefixes))
            self.language_prefix_regexes[lang] = re.compile(prefix_regex, re.I | re.UNICODE)

        if all_suffixes:
            suffix_regex = six.u('(?:{})$').format(six.u('|').join(all_suffixes))
            self.language_suffix_regexes[lang] = re.compile(suffix_regex, re.I | re.UNICODE)

        # "Similarity only" affixes are stripped only when sim_only=True
        # (see replace_prefixes/replace_suffixes below)
        sim_only_prefixes = [six.u('{}{}').format(safe_decode(phrase.lower()), whitespace_phrase) for conf in confs for phrase in conf.get('prefixes_similarity_only', [])]

        if sim_only_prefixes:
            sim_only_prefix_regex = six.u('^(?:{})').format(six.u('|').join(sim_only_prefixes + all_prefixes))
            self.language_prefix_sim_only_regexes[lang] = re.compile(sim_only_prefix_regex, re.I | re.UNICODE)

        sim_only_suffixes = [six.u('{}{}').format(whitespace_phrase, safe_decode(phrase.lower())) for conf in confs for phrase in conf.get('suffixes_similarity_only', [])]

        if sim_only_suffixes:
            sim_only_suffix_regex = six.u('(?:{})$').format(six.u('|').join(sim_only_suffixes + all_suffixes))
            self.language_suffix_sim_only_regexes[lang] = re.compile(sim_only_suffix_regex, re.I | re.UNICODE)

    def replace_prefixes(self, name, lang, country=None, sim_only=False):
        name = safe_decode(name).strip()

        if not sim_only or lang not in self.language_prefix_sim_only_regexes:
            d = self.language_prefix_regexes
        else:
            d = self.language_prefix_sim_only_regexes

        # Apply the country-specific regex (if any) before the language-level one
        regex = None
        if country is not None:
            regex = d.get((country, lang))
            if regex:
                name = regex.sub(six.u(''), name)

        regex = d.get(lang)
        if not regex:
            return name
        return regex.sub(six.u(''), name)

    def replace_suffixes(self, name, lang, country=None, sim_only=False):
        name = safe_decode(name).strip()

        if not sim_only or lang not in self.language_suffix_sim_only_regexes:
            d = self.language_suffix_regexes
        else:
            d = self.language_suffix_sim_only_regexes

        regex = None
        if country is not None:
            regex = d.get((country, lang))
            if regex:
                name = regex.sub(six.u(''), name)

        regex = d.get(lang)
        if not regex:
            return name
        return regex.sub(six.u(''), name)

    def replace_affixes(self, name, lang, country=None, sim_only=False):
        return self.replace_prefixes(self.replace_suffixes(name, lang, country=country, sim_only=sim_only), lang, country=country, sim_only=sim_only)


name_affixes = NameAffixes()
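A short sketch of calling the module-level name_affixes instance defined above. The example name and the language/country codes are illustrative; the affixes actually stripped depend on the per-language YAML configs.

# Illustrative call pattern only; which phrases are stripped depends on the
# YAML configs in resources/boundaries/names/languages.
display_name = name_affixes.replace_affixes(u'Gemeente Amsterdam', u'nl')

# sim_only=True additionally applies the *_similarity_only affixes, so the
# result is suited to name comparison rather than display
comparison_name = name_affixes.replace_affixes(u'Gemeente Amsterdam', u'nl',
                                               country=u'nl', sim_only=True)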

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
import six

import Levenshtein

from collections import OrderedDict


def ordered_word_count(tokens):
    counts = OrderedDict()
    for k in tokens:
        counts[k] = counts.get(k, 0) + 1
    return counts


def soft_tfidf_similarity(tokens1, tokens2, idf,
                          sim_func=Levenshtein.jaro_winkler, theta=0.95,
                          common_word_threshold=100):
    '''
    Soft TFIDF is a hybrid distance function using both global statistics
    (inverse document frequency) and local similarity (Jaro-Winkler).

    For each token t1 in the first string, find the token t2 which is most
    similar to t1 in terms of the local distance function.

    The Soft TFIDF similarity is the dot product of the max token similarities
    and the cosine similarity of the TF-IDF vectors for all tokens where
    the max similarity is >= a given threshold theta.

    sim_func should return a number in the range [0, 1] and theta should be
    in the same range, i.e. this would _not_ work for a metric like basic
    Levenshtein or Damerau-Levenshtein distance where we'd want the value to
    be below the threshold. Those metrics can be transformed into a [0, 1]
    measure.

    @param tokens1: normalized tokens of string 1 (list of strings only)
    @param tokens2: normalized tokens of string 2 (list of strings only)
    @param idf: IDFIndex from geodata.statistics.tf_idf
    @param sim_func: similarity function which takes 2 strings and returns
                     a number between 0 and 1
    @param theta: token-level threshold on sim_func's return value at
                  which point two tokens are considered "close"

    Reference:
    https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
    '''
    token1_counts = ordered_word_count(tokens1)
    token2_counts = ordered_word_count(tokens2)

    tfidf1 = idf.tfidf_vector(token1_counts)
    tfidf2 = idf.tfidf_vector(token2_counts)

    total_sim = 0.0

    t1_len = len(token1_counts)
    t2_len = len(token2_counts)

    # Iterate over the shorter token list, matching against the longer one
    if t2_len < t1_len:
        token1_counts, token2_counts = token2_counts, token1_counts
        tfidf1, tfidf2 = tfidf2, tfidf1

    for i, t1 in enumerate(token1_counts):
        sim, j = max([(sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts)])
        if sim >= theta:
            total_sim += sim * tfidf1[i] * tfidf2[j]

    return total_sim


def jaccard_similarity(tokens1, tokens2):
    '''
    Traditionally Jaccard similarity is defined for two sets:

    Jaccard(A, B) = |A ∩ B| / |A ∪ B|

    Using this for tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
    would be 1.0, which is not ideal for entity name matching.

    In this implementation the cardinalities of the set intersection/union
    are weighted by term frequencies, so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 0.67
    '''
    token1_counts = ordered_word_count(tokens1)
    token2_counts = ordered_word_count(tokens2)

    intersection = sum((min(v, token2_counts[k]) for k, v in six.iteritems(token1_counts) if k in token2_counts))
    return float(intersection) / (sum(token1_counts.values()) + sum(token2_counts.values()) - intersection)
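A small worked example for both functions above. UniformIDF is a stand-in written here for illustration only (it is not part of geodata.statistics.tf_idf); it just mimics the tfidf_vector() interface that soft_tfidf_similarity uses.

import math


class UniformIDF(object):
    '''Stand-in (illustrative only) for the IDFIndex expected by
    soft_tfidf_similarity: returns an L2-normalized vector with one weight
    per distinct token, i.e. it ignores document frequencies.'''
    def tfidf_vector(self, word_counts):
        norm = math.sqrt(sum(c * c for c in word_counts.values()))
        return [c / norm for c in word_counts.values()]


# Frequency-weighted Jaccard, matching the docstring example:
# intersection = 2, union = 3 + 2 - 2 = 3, so ~0.67
print(jaccard_similarity([u'a', u'a', u'b'], [u'a', u'b']))

# Soft TFIDF with the stand-in IDF: 'empire' and 'state' match exactly,
# while jaro_winkler('building', 'bldg') falls below theta=0.95, so only
# the two exact matches contribute (~0.67 with uniform weights).
idf = UniformIDF()
print(soft_tfidf_similarity([u'empire', u'state', u'building'],
                            [u'empire', u'state', u'bldg'], idf))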