From bd477976d1374f5edcd56e4b27f12c3615b52f9a Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 19 Oct 2017 04:51:28 -0400
Subject: [PATCH] [similarity] string similarity measures for
 Damerau-Levenshtein and Jaro-Winkler distances. Both operate on Unicode
 code points internally for lengths, etc. instead of byte strings and the
 Levenshtein distance keeps three rolling rows instead of needing to store
 the full matrix of transitions.

---
Review note: the commit id above and the blob ids on the "index" lines
below come from the original commit and predate the review edits;
regenerate the patch with git format-patch once it has been applied.

 src/string_similarity.c | 275 ++++++++++++++++++++++++++++++++++++++++
 src/string_similarity.h |  19 +++
 2 files changed, 294 insertions(+)
 create mode 100644 src/string_similarity.c
 create mode 100644 src/string_similarity.h

diff --git a/src/string_similarity.c b/src/string_similarity.c
new file mode 100644
index 00000000..9608498b
--- /dev/null
+++ b/src/string_similarity.c
@@ -0,0 +1,275 @@
+#include "string_similarity.h"
+#include "string_utils.h"
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Winkler's prefix bonus only looks at this many leading code points. */
+#define JARO_WINKLER_MAX_PREFIX 4
+
+/*
+ * Damerau-Levenshtein distance (optimal string alignment variant) between
+ * two arrays of Unicode code points.
+ *
+ * Insertions, deletions and adjacent transpositions cost 1. Substitutions
+ * cost replace_cost; a replace_cost of 0 selects the default cost of 1, so
+ * damerau_levenshtein_distance() yields the classic unit-cost distance.
+ *
+ * Three rolling rows of len1 + 1 cells are kept instead of the full
+ * (len1 + 1) x (len2 + 1) matrix.
+ *
+ * Returns the distance, or (size_t)-1 if an argument is NULL or the rows
+ * cannot be allocated.
+ */
+size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
+    if (u1_array == NULL || u2_array == NULL) return (size_t)-1;
+
+    size_t len1 = u1_array->n;
+    size_t len2 = u2_array->n;
+
+    // Against an empty string the distance is just the other length.
+    if (len1 == 0) return len2;
+    if (len2 == 0) return len1;
+
+    uint32_t *u1 = u1_array->a;
+    uint32_t *u2 = u2_array->a;
+
+    size_t substitution_cost = replace_cost > 0 ? replace_cost : 1;
+
+    // One allocation holds all three rows; calloc checks the size product.
+    size_t width = len1 + 1;
+    size_t *rows = calloc(width, 3 * sizeof(size_t));
+    if (rows == NULL) return (size_t)-1;
+
+    size_t *two_back = rows;             // row x - 2, read by transpositions
+    size_t *previous = rows + width;     // row x - 1
+    size_t *current = rows + 2 * width;  // row x
+
+    for (size_t y = 0; y <= len1; y++) {
+        previous[y] = y;
+    }
+
+    for (size_t x = 1; x <= len2; x++) {
+        current[0] = x;
+
+        for (size_t y = 1; y <= len1; y++) {
+            size_t cost = (u1[y - 1] == u2[x - 1]) ? 0 : substitution_cost;
+
+            size_t min = previous[y] + 1;
+            size_t candidate = current[y - 1] + 1;
+            if (candidate < min) min = candidate;
+
+            candidate = previous[y - 1] + cost;
+            if (candidate < min) min = candidate;
+
+            // Adjacent code points swapped: continue from the cell two rows
+            // and two columns back at the price of a single edit.
+            if (x > 1 && y > 1 && u1[y - 1] == u2[x - 2] && u1[y - 2] == u2[x - 1]) {
+                candidate = two_back[y - 2] + 1;
+                if (candidate < min) min = candidate;
+            }
+
+            current[y] = min;
+        }
+
+        // Rotate the rows; the oldest one is overwritten on the next pass.
+        size_t *recycled = two_back;
+        two_back = previous;
+        previous = current;
+        current = recycled;
+    }
+
+    size_t dist = previous[len1];
+    free(rows);
+    return dist;
+}
+
+/*
+ * Damerau-Levenshtein distance between two strings, measured on the code
+ * points produced by unicode_codepoints(). replace_cost is the price of a
+ * substitution; 0 selects the default cost of 1.
+ *
+ * Returns the distance, or -1 if either string is NULL or a working
+ * buffer could not be obtained.
+ */
+ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost) {
+    if (s1 == NULL || s2 == NULL) return -1;
+
+    uint32_array *u1 = unicode_codepoints(s1);
+    if (u1 == NULL) return -1;
+
+    uint32_array *u2 = unicode_codepoints(s2);
+    if (u2 == NULL) {
+        uint32_array_destroy(u1);
+        return -1;
+    }
+
+    size_t dist = damerau_levenshtein_distance_unicode(u1, u2, replace_cost);
+
+    uint32_array_destroy(u1);
+    uint32_array_destroy(u2);
+
+    // (size_t)-1 is the failure marker of the code point routine.
+    return dist == (size_t)-1 ? -1 : (ssize_t)dist;
+}
+
+/* Damerau-Levenshtein distance with the default unit substitution cost. */
+ssize_t damerau_levenshtein_distance(char *s1, char *s2) {
+    return damerau_levenshtein_distance_replace_cost(s1, s2, 0);
+}
+
+/*
+ * Jaro similarity between two arrays of Unicode code points: 1.0 for
+ * identical inputs (two empty arrays included), 0.0 when nothing matches.
+ *
+ * Returns -1.0 if an argument is NULL or the match flags cannot be
+ * allocated.
+ */
+double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
+    if (u1_array == NULL || u2_array == NULL) return -1.0;
+
+    size_t len1 = u1_array->n;
+    size_t len2 = u2_array->n;
+
+    // If both strings are zero-length, return 1. If only one is, return 0
+    if (len1 == 0 || len2 == 0) return len1 == len2 ? 1.0 : 0.0;
+
+    // Code points may match up to floor(max_len / 2) - 1 positions apart.
+    // Clamp at 0: for max_len == 1 the unsigned subtraction would wrap.
+    size_t max_len = len1 > len2 ? len1 : len2;
+    size_t half_len = max_len / 2;
+    size_t match_distance = half_len > 0 ? half_len - 1 : 0;
+
+    // One flag per code point, sized by the string each array describes.
+    uint8_t *u1_matches = calloc(len1, sizeof(uint8_t));
+    uint8_t *u2_matches = calloc(len2, sizeof(uint8_t));
+    if (u1_matches == NULL || u2_matches == NULL) {
+        free(u1_matches);
+        free(u2_matches);
+        return -1.0;
+    }
+
+    uint32_t *u1 = u1_array->a;
+    uint32_t *u2 = u2_array->a;
+
+    size_t matches = 0;
+
+    // count matches
+    for (size_t i = 0; i < len1; i++) {
+        // start and end take into account the match distance
+        size_t start = i > match_distance ? i - match_distance : 0;
+        size_t end = (i + match_distance + 1) < len2 ? i + match_distance + 1 : len2;
+
+        for (size_t k = start; k < end; k++) {
+            // skip code points already matched or not equal
+            if (u2_matches[k] || u1[i] != u2[k]) continue;
+            // otherwise record a match on both sides and increment counter
+            u1_matches[i] = 1;
+            u2_matches[k] = 1;
+            matches++;
+            break;
+        }
+    }
+
+    double jaro = 0.0;
+
+    if (matches > 0) {
+        // count matched code points that line up in a different order
+        size_t out_of_order = 0;
+        size_t k = 0;
+        for (size_t i = 0; i < len1; i++) {
+            // wait for a match in u1
+            if (!u1_matches[i]) continue;
+            // get the next matched character in u2
+            while (!u2_matches[k]) k++;
+            if (u1[i] != u2[k]) out_of_order++;
+            k++;
+        }
+
+        // transpositions double-count transposed characters, so divide by 2
+        double transpositions = out_of_order / 2.0;
+        double m = (double)matches;
+
+        jaro = ((m / len1) + (m / len2) + ((m - transpositions) / m)) / 3.0;
+    }
+
+    free(u1_matches);
+    free(u2_matches);
+    return jaro;
+}
+
+/*
+ * Jaro similarity between two strings, compared on the code points produced
+ * by unicode_codepoints().
+ *
+ * Returns a score from 0.0 to 1.0, or -1.0 if either string is NULL or a
+ * working buffer could not be obtained.
+ */
+double jaro_distance(const char *s1, const char *s2) {
+    if (s1 == NULL || s2 == NULL) return -1.0;
+
+    uint32_array *u1 = unicode_codepoints(s1);
+    if (u1 == NULL) return -1.0;
+
+    uint32_array *u2 = unicode_codepoints(s2);
+    if (u2 == NULL) {
+        uint32_array_destroy(u1);
+        return -1.0;
+    }
+
+    double jaro = jaro_distance_unicode(u1, u2);
+
+    uint32_array_destroy(u1);
+    uint32_array_destroy(u2);
+    return jaro;
+}
+
+/*
+ * Jaro-Winkler similarity: the Jaro score plus a bonus for a shared prefix.
+ *
+ * The bonus is (1 - jaro) * prefix_length * prefix_scale, where the prefix
+ * length is capped at JARO_WINKLER_MAX_PREFIX code points, and it is only
+ * granted when the Jaro score reaches bonus_threshold. The result never
+ * exceeds 1.0.
+ *
+ * Returns -1.0 if either string is NULL or a working buffer could not be
+ * obtained.
+ */
+double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
+    if (s1 == NULL || s2 == NULL) return -1.0;
+
+    uint32_array *u1_array = unicode_codepoints(s1);
+    if (u1_array == NULL) return -1.0;
+
+    uint32_array *u2_array = unicode_codepoints(s2);
+    if (u2_array == NULL) {
+        uint32_array_destroy(u1_array);
+        return -1.0;
+    }
+
+    double jaro_winkler = jaro_distance_unicode(u1_array, u2_array);
+
+    // A negative score is an error from the Jaro step; pass it through.
+    if (jaro_winkler >= 0.0 && jaro_winkler >= bonus_threshold) {
+        uint32_t *u1 = u1_array->a;
+        uint32_t *u2 = u2_array->a;
+
+        size_t limit = u1_array->n < u2_array->n ? u1_array->n : u2_array->n;
+        if (limit > JARO_WINKLER_MAX_PREFIX) limit = JARO_WINKLER_MAX_PREFIX;
+
+        size_t prefix = 0;
+        while (prefix < limit && u1[prefix] == u2[prefix]) prefix++;
+
+        jaro_winkler += (1.0 - jaro_winkler) * prefix * prefix_scale;
+    }
+
+    uint32_array_destroy(u1_array);
+    uint32_array_destroy(u2_array);
+
+    return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
+}
+
+/* Jaro-Winkler similarity with the default prefix scale and bonus threshold. */
+double jaro_winkler_distance(const char *s1, const char *s2) {
+    return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
+}
diff --git a/src/string_similarity.h b/src/string_similarity.h
new file mode 100644
index 00000000..d5fcf805
--- /dev/null
+++ b/src/string_similarity.h
@@ -0,0 +1,19 @@
+#ifndef STRING_SIMILARITY_H
+#define STRING_SIMILARITY_H
+
+#include <stdlib.h>
+#include <sys/types.h>
+
+#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1
+#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7
+
+/* Edit distances in Unicode code points; -1 on NULL input or failure. */
+ssize_t damerau_levenshtein_distance(char *s1, char *s2);
+ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost);
+
+/* Similarity scores from 0.0 to 1.0; -1.0 on NULL input or failure. */
+double jaro_distance(const char *s1, const char *s2);
+double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold);
+double jaro_winkler_distance(const char *s1, const char *s2);
+
+#endif