[similarity] exposing unicode versions of Damerau-Levenshtein and Jaro-Winkler distances

This commit is contained in:
Al
2017-10-28 02:45:48 -04:00
parent 2d6079b06f
commit bc9f11d6e3
2 changed files with 52 additions and 33 deletions

View File

@@ -1,8 +1,7 @@
#include "string_similarity.h"
#include "string_utils.h"
size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
size_t len1 = u1_array->n;
size_t len2 = u2_array->n;
@@ -12,6 +11,10 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array
size_t num_bytes = (len1 + 1) * sizeof(size_t);
size_t *column = malloc(num_bytes);
if (column == NULL) {
return -1.0;
}
for (size_t y = 1; y <= len1; y++) {
column[y] = y;
}
@@ -47,26 +50,26 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array
size_t dist = column[len1];
free(column);
return dist;
return (ssize_t)dist;
}
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) {
if (s1 == NULL || s2 == NULL) return -1;
uint32_array *u1 = unicode_codepoints(s1);
if (u1 == NULL) return -1.0;
uint32_array *u1_array = unicode_codepoints(s1);
if (u1_array == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2);
uint32_array *u2_array = unicode_codepoints(s2);
if (u2 == NULL) {
uint32_array_destroy(u1);
if (u2_array == NULL) {
uint32_array_destroy(u1_array);
return -1.0;
}
ssize_t lev = damerau_levenshtein_distance_unicode(u1, u2, replace_cost);
ssize_t lev = damerau_levenshtein_distance_unicode(u1_array, u2_array, replace_cost);
uint32_array_destroy(u1);
uint32_array_destroy(u2);
uint32_array_destroy(u1_array);
uint32_array_destroy(u2_array);
return lev;
}
@@ -151,27 +154,6 @@ double jaro_distance(const char *s1, const char *s2) {
return -1.0;
}
uint32_array *u1 = unicode_codepoints(s1);
if (u1 == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2);
if (u2 == NULL) {
uint32_array_destroy(u1);
return -1.0;
}
double jaro = jaro_distance_unicode(u1, u2);
uint32_array_destroy(u1);
uint32_array_destroy(u2);
return jaro;
}
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1_array = unicode_codepoints(s1);
if (u1_array == NULL) return -1.0;
@@ -182,6 +164,13 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do
return -1.0;
}
double jaro = jaro_distance_unicode(u1_array, u2_array);
uint32_array_destroy(u1_array);
uint32_array_destroy(u2_array);
return jaro;
}
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) {
double jaro = jaro_distance_unicode(u1_array, u2_array);
double j;
@@ -205,12 +194,36 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
}
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
}
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1_array = unicode_codepoints(s1);
if (u1_array == NULL) return -1.0;
uint32_array *u2_array = unicode_codepoints(s2);
if (u2_array == NULL) {
uint32_array_destroy(u1_array);
return -1.0;
}
double jaro_winkler = jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, prefix_scale, bonus_threshold);
uint32_array_destroy(u1_array);
uint32_array_destroy(u2_array);
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
return jaro_winkler;
}
inline double jaro_winkler_distance(const char *s1, const char *s2) {
return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
}
inline double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
return jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
}

View File

@@ -4,15 +4,21 @@
#include <stdio.h>
#include <stdlib.h>
#include "collections.h"
#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1
#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7
ssize_t damerau_levenshtein_distance(const char *s1, const char *s2);
ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost);
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost);
double jaro_distance(const char *s1, const char *s2);
double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array);
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold);
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold);
double jaro_winkler_distance(const char *s1, const char *s2);
double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array);
#endif