[similarity] exposing unicode versions of Damerau-Levenshtein and Jaro-Winkler distances
This commit is contained in:
@@ -1,8 +1,7 @@
|
|||||||
#include "string_similarity.h"
|
#include "string_similarity.h"
|
||||||
#include "string_utils.h"
|
#include "string_utils.h"
|
||||||
|
|
||||||
|
ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
|
||||||
size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
|
|
||||||
size_t len1 = u1_array->n;
|
size_t len1 = u1_array->n;
|
||||||
size_t len2 = u2_array->n;
|
size_t len2 = u2_array->n;
|
||||||
|
|
||||||
@@ -12,6 +11,10 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array
|
|||||||
size_t num_bytes = (len1 + 1) * sizeof(size_t);
|
size_t num_bytes = (len1 + 1) * sizeof(size_t);
|
||||||
|
|
||||||
size_t *column = malloc(num_bytes);
|
size_t *column = malloc(num_bytes);
|
||||||
|
if (column == NULL) {
|
||||||
|
return -1.0;
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t y = 1; y <= len1; y++) {
|
for (size_t y = 1; y <= len1; y++) {
|
||||||
column[y] = y;
|
column[y] = y;
|
||||||
}
|
}
|
||||||
@@ -47,26 +50,26 @@ size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array
|
|||||||
|
|
||||||
size_t dist = column[len1];
|
size_t dist = column[len1];
|
||||||
free(column);
|
free(column);
|
||||||
return dist;
|
return (ssize_t)dist;
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) {
|
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) {
|
||||||
if (s1 == NULL || s2 == NULL) return -1;
|
if (s1 == NULL || s2 == NULL) return -1;
|
||||||
|
|
||||||
uint32_array *u1 = unicode_codepoints(s1);
|
uint32_array *u1_array = unicode_codepoints(s1);
|
||||||
if (u1 == NULL) return -1.0;
|
if (u1_array == NULL) return -1.0;
|
||||||
|
|
||||||
uint32_array *u2 = unicode_codepoints(s2);
|
uint32_array *u2_array = unicode_codepoints(s2);
|
||||||
|
|
||||||
if (u2 == NULL) {
|
if (u2_array == NULL) {
|
||||||
uint32_array_destroy(u1);
|
uint32_array_destroy(u1_array);
|
||||||
return -1.0;
|
return -1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ssize_t lev = damerau_levenshtein_distance_unicode(u1, u2, replace_cost);
|
ssize_t lev = damerau_levenshtein_distance_unicode(u1_array, u2_array, replace_cost);
|
||||||
|
|
||||||
uint32_array_destroy(u1);
|
uint32_array_destroy(u1_array);
|
||||||
uint32_array_destroy(u2);
|
uint32_array_destroy(u2_array);
|
||||||
return lev;
|
return lev;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,27 +154,6 @@ double jaro_distance(const char *s1, const char *s2) {
|
|||||||
return -1.0;
|
return -1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_array *u1 = unicode_codepoints(s1);
|
|
||||||
if (u1 == NULL) return -1.0;
|
|
||||||
|
|
||||||
uint32_array *u2 = unicode_codepoints(s2);
|
|
||||||
|
|
||||||
if (u2 == NULL) {
|
|
||||||
uint32_array_destroy(u1);
|
|
||||||
return -1.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
double jaro = jaro_distance_unicode(u1, u2);
|
|
||||||
uint32_array_destroy(u1);
|
|
||||||
uint32_array_destroy(u2);
|
|
||||||
return jaro;
|
|
||||||
}
|
|
||||||
|
|
||||||
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
|
|
||||||
if (s1 == NULL || s2 == NULL) {
|
|
||||||
return -1.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_array *u1_array = unicode_codepoints(s1);
|
uint32_array *u1_array = unicode_codepoints(s1);
|
||||||
if (u1_array == NULL) return -1.0;
|
if (u1_array == NULL) return -1.0;
|
||||||
|
|
||||||
@@ -182,6 +164,13 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do
|
|||||||
return -1.0;
|
return -1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double jaro = jaro_distance_unicode(u1_array, u2_array);
|
||||||
|
uint32_array_destroy(u1_array);
|
||||||
|
uint32_array_destroy(u2_array);
|
||||||
|
return jaro;
|
||||||
|
}
|
||||||
|
|
||||||
|
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) {
|
||||||
double jaro = jaro_distance_unicode(u1_array, u2_array);
|
double jaro = jaro_distance_unicode(u1_array, u2_array);
|
||||||
|
|
||||||
double j;
|
double j;
|
||||||
@@ -205,12 +194,36 @@ double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, do
|
|||||||
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
|
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
|
||||||
|
}
|
||||||
|
|
||||||
|
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
|
||||||
|
if (s1 == NULL || s2 == NULL) {
|
||||||
|
return -1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_array *u1_array = unicode_codepoints(s1);
|
||||||
|
if (u1_array == NULL) return -1.0;
|
||||||
|
|
||||||
|
uint32_array *u2_array = unicode_codepoints(s2);
|
||||||
|
|
||||||
|
if (u2_array == NULL) {
|
||||||
|
uint32_array_destroy(u1_array);
|
||||||
|
return -1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double jaro_winkler = jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, prefix_scale, bonus_threshold);
|
||||||
|
|
||||||
uint32_array_destroy(u1_array);
|
uint32_array_destroy(u1_array);
|
||||||
uint32_array_destroy(u2_array);
|
uint32_array_destroy(u2_array);
|
||||||
|
|
||||||
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
|
return jaro_winkler;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline double jaro_winkler_distance(const char *s1, const char *s2) {
|
inline double jaro_winkler_distance(const char *s1, const char *s2) {
|
||||||
return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
|
return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
|
||||||
|
return jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
|
||||||
|
}
|
||||||
|
|||||||
@@ -4,15 +4,21 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "collections.h"
|
||||||
|
|
||||||
#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1
|
#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1
|
||||||
#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7
|
#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7
|
||||||
|
|
||||||
ssize_t damerau_levenshtein_distance(const char *s1, const char *s2);
|
ssize_t damerau_levenshtein_distance(const char *s1, const char *s2);
|
||||||
|
ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost);
|
||||||
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost);
|
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost);
|
||||||
|
|
||||||
double jaro_distance(const char *s1, const char *s2);
|
double jaro_distance(const char *s1, const char *s2);
|
||||||
|
double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array);
|
||||||
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold);
|
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold);
|
||||||
|
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold);
|
||||||
double jaro_winkler_distance(const char *s1, const char *s2);
|
double jaro_winkler_distance(const char *s1, const char *s2);
|
||||||
|
double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array);
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user