# -*- coding: utf-8 -*-
import Levenshtein

from collections import OrderedDict


def ordered_word_count(tokens):
    '''Count token frequencies, preserving the order in which tokens first appear.'''
    counts = OrderedDict()
    for k in tokens:
        counts[k] = counts.get(k, 0) + 1
    return counts
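
# For example, ordered_word_count keeps first-seen token order:
#     ordered_word_count(['a', 'a', 'b']) == OrderedDict([('a', 2), ('b', 1)])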

def soft_tfidf_similarity(tokens1, tokens2, idf,
                          sim_func=Levenshtein.jaro_winkler, theta=0.95,
                          common_word_threshold=100):
    '''
    Soft TFIDF is a hybrid distance function using both global statistics
    (inverse document frequency) and local similarity (Jaro-Winkler).

    For each token t1 in the first string, find the token t2 which is most
    similar to t1 in terms of the local distance function.

    The SoftTFIDF similarity is the dot product of the max token similarities
    and the cosine similarity of the TF-IDF vectors for all tokens where
    the max similarity is >= a given threshold theta.

    sim_func should return a number in the range [0, 1] and theta should be
    in the same range, i.e. this would _not_ work for a metric like basic
    Levenshtein or Damerau-Levenshtein distance where we'd want the value
    to be below the threshold. Those metrics can be transformed into a
    [0, 1] measure.

    @param tokens1: normalized tokens of string 1 (list of strings only)
    @param tokens2: normalized tokens of string 2 (list of strings only)

    @param idf: IDFIndex from geodata.statistics.tf_idf
    @param sim_func: similarity function which takes 2 strings and returns
                     a number between 0 and 1
    @param theta: token-level threshold on sim_func's return value at
                  which point two tokens are considered "close"

    Reference:
    https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
    '''

    token1_counts = ordered_word_count(tokens1)
    token2_counts = ordered_word_count(tokens2)

    tfidf1 = idf.tfidf_vector(token1_counts)
    tfidf2 = idf.tfidf_vector(token2_counts)

    total_sim = 0.0

    t1_len = len(token1_counts)
    t2_len = len(token2_counts)

    # Always iterate over the shorter token sequence
    if t2_len < t1_len:
        token1_counts, token2_counts = token2_counts, token1_counts
        tfidf1, tfidf2 = tfidf2, tfidf1

    for i, t1 in enumerate(token1_counts):
        # Most similar token in the other string under sim_func, and its index
        sim, j = max([(sim_func(t1, t2), j) for j, t2 in enumerate(token2_counts)])
        if sim >= theta:
            total_sim += sim * tfidf1[i] * tfidf2[j]

    return total_sim
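
# A minimal usage sketch for soft_tfidf_similarity, assuming only the
# tfidf_vector() method the function calls above (the real index is
# IDFIndex from geodata.statistics.tf_idf). With a uniform stand-in IDF,
# the score reduces to the sum of the matched token similarities:
#
#     class UniformIDF(object):
#         def tfidf_vector(self, token_counts):
#             # every distinct token gets the same weight
#             return [1.0] * len(token_counts)
#
#     soft_tfidf_similarity(['saint', 'marks', 'st'],
#                           ['st', 'marks', 'street'],
#                           UniformIDF())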

def jaccard_similarity(tokens1, tokens2):
    '''
    Traditionally Jaccard similarity is defined for two sets:

    Jaccard(A, B) = |A ∩ B| / |A ∪ B|

    Using this for tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
    would be 1.0, which is not ideal for entity name matching.

    In this implementation the cardinalities of the intersection and union
    are weighted by term frequencies, so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 0.67
    '''
    token1_counts = ordered_word_count(tokens1)
    token2_counts = ordered_word_count(tokens2)

    # Multiset intersection: sum of the min term frequencies for shared tokens
    intersection = sum((min(v, token2_counts[k]) for k, v in token1_counts.items() if k in token2_counts))
    return float(intersection) / (sum(token1_counts.values()) + sum(token2_counts.values()) - intersection)
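
# A minimal sanity check, assuming the module is run directly rather than imported.
if __name__ == '__main__':
    # Weighted Jaccard from the docstring example: intersection = min(2, 1) + min(1, 1) = 2,
    # union = 3 + 2 - 2 = 3, so the similarity is 2 / 3 ≈ 0.67
    print(jaccard_similarity(['a', 'a', 'b'], ['a', 'b']))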