From 3bdb8c86306a9b155f934b20b1168f8631f3ccba Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 31 Dec 2017 13:22:00 -0500 Subject: [PATCH] [similarity] max out the Jaro-Winkler shared prefix at 4 characters in accordance with Winkler's paper --- src/string_similarity.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index 61882325..6a16518d 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -520,6 +520,8 @@ double jaro_distance(const char *s1, const char *s2) { return jaro; } +#define MAX_JARO_WINKLER_PREFIX 4 + double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) { double jaro = jaro_distance_unicode(u1_array, u2_array); @@ -533,15 +535,20 @@ double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, ui size_t m = len1 < len2 ? len1 : len2; - size_t i = 0; - for (; i < m; i++) { + size_t shared_prefix = 0; + for (size_t i = 0; i < m; i++) { if (u1[i] != u2[i]) break; + shared_prefix++; + if (shared_prefix > MAX_JARO_WINKLER_PREFIX) { + shared_prefix = MAX_JARO_WINKLER_PREFIX; + break; + } } double jaro_winkler = jaro; if (jaro >= bonus_threshold) { - jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale; + jaro_winkler += (1.0 - jaro) * shared_prefix * prefix_scale; } return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;