diff --git a/scripts/geodata/names/similarity.py b/scripts/geodata/names/similarity.py
index 35e0c352..ec5431d7 100644
--- a/scripts/geodata/names/similarity.py
+++ b/scripts/geodata/names/similarity.py
@@ -2,8 +2,15 @@ import Levenshtein
 
 from collections import OrderedDict
 
+def ordered_word_count(tokens):
+    counts = OrderedDict()
+    for k in tokens:
+        counts[k] = counts.get(k, 0) + 1
+    return counts
+
+
 def soft_tfidf_similarity(tokens1, tokens2, idf,
-                          sim_func=Levenshtein.jaro_winkler, theta=0.9,
+                          sim_func=Levenshtein.jaro_winkler, theta=0.95,
                           common_word_threshold=100):
     '''
     Soft TFIDF is a hybrid distance function using both global statistics
@@ -35,13 +42,8 @@ def soft_tfidf_similarity(tokens1, tokens2, idf,
     https://www.cs.cmu.edu/~pradeepr/papers/ijcai03.pdf
     '''
 
-    token1_counts = OrderedDict()
-    for k in tokens1:
-        token1_counts[k] = token1_counts.get(k, 0) + 1
-
-    token2_counts = OrderedDict()
-    for k in tokens2:
-        token2_counts[k] = token2_counts.get(k, 0) + 1
+    token1_counts = ordered_word_count(tokens1)
+    token2_counts = ordered_word_count(tokens2)
 
     tfidf1 = idf.tfidf_vector(token1_counts)
     tfidf2 = idf.tfidf_vector(token2_counts)
@@ -61,3 +63,22 @@ def soft_tfidf_similarity(tokens1, tokens2, idf,
            total_sim += sim * tfidf1[i] * tfidf2[j]
 
     return total_sim
+
+
+def jaccard_similarity(tokens1, tokens2):
+    '''
+    Traditionally, Jaccard similarity is defined for two sets:
+
+    Jaccard(A, B) = |A ∩ B| / |A ∪ B|
+
+    Using this for tokens, the similarity of ['a', 'a', 'b'] and ['a', 'b']
+    would be 1.0, which is not ideal for entity name matching.
+
+    In this implementation the cardinalities of the intersection and union
+    are weighted by term frequencies, so Jaccard(['a', 'a', 'b'], ['a', 'b']) ≈ 0.67
+    '''
+    token1_counts = ordered_word_count(tokens1)
+    token2_counts = ordered_word_count(tokens2)
+
+    intersection = sum(min(v, token2_counts[k]) for k, v in token1_counts.iteritems() if k in token2_counts)
+    return float(intersection) / (sum(token1_counts.values()) + sum(token2_counts.values()) - intersection)
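
For reviewers who want to see what the `theta` bump in the first hunk tightens, here is a minimal standalone sketch of the Soft TFIDF loop. Everything beyond the diff is an assumption: the name `soft_tfidf_sketch`, the plain `idf_weights` dict, and the L2-normalized tf-idf vectors stand in for the repo's `idf` object, whose `tfidf_vector` method may weight or normalize terms differently.

```python
import math

import Levenshtein


def soft_tfidf_sketch(tokens1, tokens2, idf_weights,
                      sim_func=Levenshtein.jaro_winkler, theta=0.95):
    def tfidf_vector(tokens):
        # Hypothetical stand-in for idf.tfidf_vector: tf * idf per distinct
        # token, L2-normalized (a common convention, not necessarily the repo's).
        counts = {}
        for t in tokens:
            counts[t] = counts.get(t, 0) + 1
        raw = {t: c * idf_weights.get(t, 0.0) for t, c in counts.items()}
        norm = math.sqrt(sum(w * w for w in raw.values())) or 1.0
        return {t: w / norm for t, w in raw.items()}

    v1, v2 = tfidf_vector(tokens1), tfidf_vector(tokens2)
    if not v1 or not v2:
        return 0.0

    total_sim = 0.0
    for t1, w1 in v1.items():
        # Pair each token with its closest counterpart in the other name;
        # only pairs scoring >= theta contribute to the total, so raising
        # theta from 0.9 to 0.95 makes the matcher stricter about which
        # near-duplicates count as the same word.
        sim, t2 = max((sim_func(t1, t), t) for t in v2)
        if sim >= theta:
            total_sim += sim * w1 * v2[t2]
    return total_sim


# 'center'/'centre' score about 0.97 under Jaro-Winkler, so the pair still
# contributes at theta=0.95; a token with no close match contributes nothing.
toy_idf = {'maple': 3.2, 'center': 1.8, 'centre': 1.8}
print(soft_tfidf_sketch(['maple', 'center'], ['maple', 'centre'], toy_idf))
```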
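
The frequency-weighted Jaccard in the last hunk can be sanity-checked against the docstring's own numbers. Below is a Python 3 sketch (the diff itself targets Python 2, hence `iteritems`) using `collections.Counter`, whose `&` operator takes the elementwise minimum of counts, i.e. exactly the multiset intersection `jaccard_similarity` builds by hand:

```python
from collections import Counter


def weighted_jaccard(tokens1, tokens2):
    # Counter & Counter keeps the elementwise minimum of each token's count,
    # the same term-frequency-weighted intersection as jaccard_similarity.
    c1, c2 = Counter(tokens1), Counter(tokens2)
    intersection = sum((c1 & c2).values())
    union = sum(c1.values()) + sum(c2.values()) - intersection
    return intersection / union


# intersection = min(2, 1) + min(1, 1) = 2; union = 3 + 2 - 2 = 3
print(weighted_jaccard(['a', 'a', 'b'], ['a', 'b']))  # 0.666..., the 0.67 in the docstring
```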