[similarity] exposing unicode versions of Damerau-Levenshtein and Jaro-Winkler distances

This commit is contained in:
Al
2017-10-28 02:45:48 -04:00
parent 2d6079b06f
commit bc9f11d6e3
2 changed files with 52 additions and 33 deletions

View File

@@ -1,8 +1,7 @@
#include "string_similarity.h" #include "string_similarity.h"
#include "string_utils.h" #include "string_utils.h"
ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
size_t len1 = u1_array->n; size_t len1 = u1_array->n;
size_t len2 = u2_array->n; size_t len2 = u2_array->n;
@@ -12,6 +11,10 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array
size_t num_bytes = (len1 + 1) * sizeof(size_t); size_t num_bytes = (len1 + 1) * sizeof(size_t);
size_t *column = malloc(num_bytes); size_t *column = malloc(num_bytes);
if (column == NULL) {
return -1.0;
}
for (size_t y = 1; y <= len1; y++) { for (size_t y = 1; y <= len1; y++) {
column[y] = y; column[y] = y;
} }
@@ -47,26 +50,26 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array
size_t dist = column[len1]; size_t dist = column[len1];
free(column); free(column);
return dist; return (ssize_t)dist;
} }
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) { ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) {
if (s1 == NULL || s2 == NULL) return -1; if (s1 == NULL || s2 == NULL) return -1;
uint32_array *u1 = unicode_codepoints(s1); uint32_array *u1_array = unicode_codepoints(s1);
if (u1 == NULL) return -1.0; if (u1_array == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2); uint32_array *u2_array = unicode_codepoints(s2);
if (u2 == NULL) { if (u2_array == NULL) {
uint32_array_destroy(u1); uint32_array_destroy(u1_array);
return -1.0; return -1.0;
} }
ssize_t lev = damerau_levenshtein_distance_unicode(u1, u2, replace_cost); ssize_t lev = damerau_levenshtein_distance_unicode(u1_array, u2_array, replace_cost);
uint32_array_destroy(u1); uint32_array_destroy(u1_array);
uint32_array_destroy(u2); uint32_array_destroy(u2_array);
return lev; return lev;
} }
@@ -151,27 +154,6 @@ double jaro_distance(const char *s1, const char *s2) {
return -1.0; return -1.0;
} }
uint32_array *u1 = unicode_codepoints(s1);
if (u1 == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2);
if (u2 == NULL) {
uint32_array_destroy(u1);
return -1.0;
}
double jaro = jaro_distance_unicode(u1, u2);
uint32_array_destroy(u1);
uint32_array_destroy(u2);
return jaro;
}
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1_array = unicode_codepoints(s1); uint32_array *u1_array = unicode_codepoints(s1);
if (u1_array == NULL) return -1.0; if (u1_array == NULL) return -1.0;
@@ -182,6 +164,13 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do
return -1.0; return -1.0;
} }
double jaro = jaro_distance_unicode(u1_array, u2_array);
uint32_array_destroy(u1_array);
uint32_array_destroy(u2_array);
return jaro;
}
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) {
double jaro = jaro_distance_unicode(u1_array, u2_array); double jaro = jaro_distance_unicode(u1_array, u2_array);
double j; double j;
@@ -205,12 +194,36 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale; jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
} }
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
}
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1_array = unicode_codepoints(s1);
if (u1_array == NULL) return -1.0;
uint32_array *u2_array = unicode_codepoints(s2);
if (u2_array == NULL) {
uint32_array_destroy(u1_array);
return -1.0;
}
double jaro_winkler = jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, prefix_scale, bonus_threshold);
uint32_array_destroy(u1_array); uint32_array_destroy(u1_array);
uint32_array_destroy(u2_array); uint32_array_destroy(u2_array);
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler; return jaro_winkler;
} }
inline double jaro_winkler_distance(const char *s1, const char *s2) { inline double jaro_winkler_distance(const char *s1, const char *s2) {
return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD); return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
} }
/*
 * Jaro-Winkler distance between two arrays of Unicode code points using the
 * library defaults: DEFAULT_JARO_WINKLER_PREFIX_SCALE for the common-prefix
 * bonus and DEFAULT_JARO_WINKLER_BONUS_THRESHOLD as the bonus threshold.
 *
 * Simply forwards to jaro_winkler_distance_unicode_prefix_threshold() and
 * returns its result unchanged.
 */
inline double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
    const double prefix_scale = DEFAULT_JARO_WINKLER_PREFIX_SCALE;
    const double bonus_threshold = DEFAULT_JARO_WINKLER_BONUS_THRESHOLD;

    return jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, prefix_scale, bonus_threshold);
}

View File

@@ -4,15 +4,21 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include "collections.h"
#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1 #define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1
#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7 #define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7
ssize_t damerau_levenshtein_distance(const char *s1, const char *s2); ssize_t damerau_levenshtein_distance(const char *s1, const char *s2);
ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost);
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost); ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost);
double jaro_distance(const char *s1, const char *s2); double jaro_distance(const char *s1, const char *s2);
double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array);
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold); double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold);
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold);
double jaro_winkler_distance(const char *s1, const char *s2); double jaro_winkler_distance(const char *s1, const char *s2);
double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array);
#endif #endif