[similarity] max out the Jaro-Winkler shared prefix at 4 characters in accordance with Winkler's paper
This commit is contained in:
@@ -520,6 +520,8 @@ double jaro_distance(const char *s1, const char *s2) {
|
|||||||
return jaro;
|
return jaro;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Upper bound on the number of leading matching characters that count toward
 * the Jaro-Winkler prefix bonus; the shared-prefix length is clamped to this
 * value before it is multiplied by the prefix scale (4, per Winkler's paper). */
#define MAX_JARO_WINKLER_PREFIX 4
|
||||||
|
|
||||||
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) {
|
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) {
|
||||||
double jaro = jaro_distance_unicode(u1_array, u2_array);
|
double jaro = jaro_distance_unicode(u1_array, u2_array);
|
||||||
|
|
||||||
@@ -533,15 +535,20 @@ double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, ui
|
|||||||
|
|
||||||
size_t m = len1 < len2 ? len1 : len2;
|
size_t m = len1 < len2 ? len1 : len2;
|
||||||
|
|
||||||
size_t i = 0;
|
size_t shared_prefix = 0;
|
||||||
for (; i < m; i++) {
|
for (size_t i = 0; i < m; i++) {
|
||||||
if (u1[i] != u2[i]) break;
|
if (u1[i] != u2[i]) break;
|
||||||
|
shared_prefix++;
|
||||||
|
if (shared_prefix > MAX_JARO_WINKLER_PREFIX) {
|
||||||
|
shared_prefix = MAX_JARO_WINKLER_PREFIX;
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double jaro_winkler = jaro;
|
double jaro_winkler = jaro;
|
||||||
|
|
||||||
if (jaro >= bonus_threshold) {
|
if (jaro >= bonus_threshold) {
|
||||||
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
|
jaro_winkler += (1.0 - jaro) * shared_prefix * prefix_scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
|
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
|
||||||
|
|||||||
Reference in New Issue
Block a user