From a5c12960449562b16a6c2e2c3fdbdd694770e205 Mon Sep 17 00:00:00 2001
From: Al
Date: Fri, 30 Oct 2015 13:35:11 -0400
Subject: [PATCH] [similarity] Add Jaccard similarity using word frequencies
 instead of simple sets; better for CJK scripts (Han, Hangul, etc.) in the
 absence of word segmentation, since there may be many high-frequency
 characters

---
 scripts/geodata/names/similarity.py | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/scripts/geodata/names/similarity.py b/scripts/geodata/names/similarity.py
index 35e0c352..ec5431d7 100644
--- a/scripts/geodata/names/similarity.py
+++ b/scripts/geodata/names/similarity.py
@@ -2,8 +2,15 @@
 import Levenshtein
 from collections import OrderedDict
 
+def ordered_word_count(tokens):
+    counts = OrderedDict()
+    for k in tokens:
+        counts[k] = counts.get(k, 0) + 1
+    return counts
+
+
 def soft_tfidf_similarity(tokens1, tokens2, idf,
-                          sim_func=Levenshtein.jaro_winkler, theta=0.9,
+                          sim_func=Levenshtein.jaro_winkler, theta=0.95,
                           common_word_threshold=100):
     '''
     Soft TFIDF is a hybrid distance function using both global statistics
@@ -35,13 +42,8 @@ def soft_tfidf_similarity(tokens1, tokens2, idf,
 
     https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
     '''
-    token1_counts = OrderedDict()
-    for k in tokens1:
-        token1_counts[k] = token1_counts.get(k, 0) + 1
-
-    token2_counts = OrderedDict()
-    for k in tokens2:
-        token2_counts[k] = token2_counts.get(k, 0) + 1
+    token1_counts = ordered_word_count(tokens1)
+    token2_counts = ordered_word_count(tokens2)
 
     tfidf1 = idf.tfidf_vector(token1_counts)
     tfidf2 = idf.tfidf_vector(token2_counts)
@@ -61,3 +63,26 @@ def soft_tfidf_similarity(tokens1, tokens2, idf,
         total_sim += sim * tfidf1[i] * tfidf2[j]
 
     return total_sim
+
+
+def jaccard_similarity(tokens1, tokens2):
+    '''
+    Traditionally, Jaccard similarity is defined on two sets:
+
+    Jaccard(A, B) = |A ∩ B| / |A ∪ B|
+
+    Applied directly to tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
+    would be 1.0, which is not ideal for entity name matching.
+
+    In this implementation the cardinalities of the intersection and union are
+    weighted by term frequencies, so Jaccard(['a', 'a', 'b'], ['a', 'b']) = 2/3 ≈ 0.67
+    '''
+    token1_counts = ordered_word_count(tokens1)
+    token2_counts = ordered_word_count(tokens2)
+
+    intersection = sum(min(v, token2_counts[k]) for k, v in token1_counts.iteritems() if k in token2_counts)
+    union = sum(token1_counts.values()) + sum(token2_counts.values()) - intersection
+    # Both token lists empty: return 0.0 rather than dividing by zero
+    if union == 0:
+        return 0.0
+    return float(intersection) / union
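
Notes (illustrative, not part of the patch):

A quick sanity check of the weighted Jaccard above, runnable under Python 2
against the patched module. The import path and the example strings are
assumptions for illustration; only jaccard_similarity itself comes from the
patch:

    # -*- coding: utf-8 -*-
    from geodata.names.similarity import jaccard_similarity

    # Repeated tokens now lower the score: intersection = min(2, 1) + min(1, 1) = 2,
    # weighted union = 3 + 2 - 2 = 3, so the score is 2/3 rather than 1.0.
    print jaccard_similarity(['a', 'a', 'b'], ['a', 'b'])    # 0.666...

    # The CJK case from the commit message: without word segmentation, Han
    # text can be tokenized per character (these place names are made up).
    name1 = list(u'北京市朝阳区')    # six characters, all with frequency 1
    name2 = list(u'北京朝阳区')      # same name minus one character
    print jaccard_similarity(name1, name2)    # 5 common / 6 total = 0.833...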
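A note on the refactor: ordered_word_count behaves like collections.Counter
but preserves first-seen token order via OrderedDict, which
soft_tfidf_similarity appears to rely on when it indexes tfidf1[i] and
tfidf2[j] positionally. A small sketch of that property (Counter is shown
only for comparison; it is not used by the patch):

    from collections import Counter
    from geodata.names.similarity import ordered_word_count

    tokens = ['rua', 'de', 'sao', 'de']
    counts = ordered_word_count(tokens)
    assert dict(counts) == dict(Counter(tokens))    # same frequencies
    assert counts.keys() == ['rua', 'de', 'sao']    # insertion order preserved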