[similarity] string similarity measures for Damerau-Levenshtein and Jaro-Winkler distances. Both operate on unicode points internally for lengths, etc. instead of byte strings and the Levenshtein distance uses only one array instead of needing to store the full matrix of transitions.

This commit is contained in:
Al
2017-10-19 04:51:28 -04:00
parent 245aa226e0
commit bd477976d1
2 changed files with 234 additions and 0 deletions

216
src/string_similarity.c Normal file
View File

@@ -0,0 +1,216 @@
#include "string_similarity.h"
#include "string_utils.h"
size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
size_t len1 = u1_array->n;
size_t len2 = u2_array->n;
uint32_t *u1 = u1_array->a;
uint32_t *u2 = u2_array->a;
size_t num_bytes = (len1 + 1) * sizeof(size_t);
size_t *column = malloc(num_bytes);
for (size_t y = 1; y <= len1; y++) {
column[y] = y;
}
size_t transpose_diag = 0;
size_t last_diag = 0;
for (size_t x = 1; x <= len2; x++) {
column[0] = x;
for (size_t y = 1, last_diag = x - 1; y <= len1; y++) {
size_t old_diag = column[y];
size_t cost = (u1[y - 1] == u2[x - 1] ? 0 : 1);
size_t v1 = column[y] + 1;
size_t v2 = column[y - 1] + 1;
size_t v3 = last_diag + cost;
size_t min = v1;
if (v2 < min) min = v2;
if (v3 < min) min = v3;
if (x > 1 && y > 1 && u1[y - 1] == u2[x - 2] && u1[y - 2] == u2[x - 1]) {
size_t v4 = transpose_diag + cost;
if (v4 < min) min = v4;
}
column[y] = min;
last_diag = old_diag;
}
transpose_diag = last_diag;
}
size_t dist = column[len1];
free(column);
return dist;
}
ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost) {
if (s1 == NULL || s2 == NULL) return -1;
uint32_array *u1 = unicode_codepoints(s1);
if (u1 == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2);
if (u2 == NULL) {
uint32_array_destroy(u1);
return -1.0;
}
ssize_t lev = damerau_levenshtein_distance_unicode(u1, u2, replace_cost);
uint32_array_destroy(u1);
uint32_array_destroy(u2);
return lev;
}
ssize_t damerau_levenshtein_distance(char *s1, char *s2) {
return damerau_levenshtein_distance_replace_cost(s1, s2, 0);
}
double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
if (u1_array == NULL || u2_array == NULL) return -1.0;
size_t len1 = u1_array->n;
size_t len2 = u2_array->n;
// If both strings are zero-length, return 1. If only one is, return 0
if (len1 == 0) return len2 == 0 ? 1.0 : 0.0;
size_t max_len = len1 > len2 ? len1 : len2;
size_t match_distance = (max_len / 2) - 1;
uint8_t *u1_matches = calloc(len2, sizeof(uint8_t));
uint8_t *u2_matches = calloc(len1, sizeof(uint8_t));
uint32_t *u1 = u1_array->a;
uint32_t *u2 = u2_array->a;
double matches = 0.0;
double transpositions = 0.0;
size_t i = 0;
// count matches
for (size_t i = 0; i < len1; i++) {
// start and end take into account the match distance
size_t start = i > match_distance ? i - match_distance : 0;
size_t end = (i + match_distance + 1) < len2 ? i + match_distance + 1 : len2;
for (size_t k = start; k < end; k++) {
// already a match at k
if (u2_matches[k]) continue;
// codepoints not equal
if (u1[i] != u2[k]) continue;
// otherwise record a match on both sides and increment counter
u1_matches[i] = true;
u2_matches[k] = true;
matches++;
break;
}
}
if (matches == 0) {
free(u1_matches);
free(u2_matches);
return 0.0;
}
// count transpositions
size_t k = 0;
for (size_t i = 0; i < len1; i++) {
// wait for a match in u1
if (!u1_matches[i]) continue;
// get the next matched character in u2
while (!u2_matches[k]) k++;
// it's a transposition
if (u1[i] != u2[k]) transpositions++;
k++;
}
// transpositions double-count transposed characters, so divide by 2
transpositions /= 2.0;
free(u1_matches);
free(u2_matches);
// Jaro distance
return ((matches / len1) +
(matches / len2) +
((matches - transpositions) / matches)) / 3.0;
}
double jaro_distance(const char *s1, const char *s2) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1 = unicode_codepoints(s1);
if (u1 == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2);
if (u2 == NULL) {
uint32_array_destroy(u1);
return -1.0;
}
double jaro = jaro_distance_unicode(u1, u2);
uint32_array_destroy(u1);
uint32_array_destroy(u2);
return jaro;
}
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1_array = unicode_codepoints(s1);
if (u1_array == NULL) return -1.0;
uint32_array *u2_array = unicode_codepoints(s2);
if (u2_array == NULL) {
uint32_array_destroy(u1_array);
return -1.0;
}
double jaro = jaro_distance_unicode(u1_array, u2_array);
double j;
size_t len1 = u1_array->n;
size_t len2 = u2_array->n;
uint32_t *u1 = u1_array->a;
uint32_t *u2 = u2_array->a;
size_t m = len1 < len2 ? len1 : len2;
size_t i = 0;
for (; i < m; i++) {
if (u1[i] != u2[i]) break;
}
double jaro_winkler = jaro;
if (jaro >= bonus_threshold) {
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
}
uint32_array_destroy(u1_array);
uint32_array_destroy(u2_array);
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
}
inline double jaro_winkler_distance(const char *s1, const char *s2) {
return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
}

18
src/string_similarity.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef STRING_SIMILARITY_H
#define STRING_SIMILARITY_H
#include <stdio.h>
#include <stdlib.h>
#define DEFAULT_JARO_WINKLER_PREFIX_SCALE 0.1
#define DEFAULT_JARO_WINKLER_BONUS_THRESHOLD 0.7
ssize_t damerau_levenshtein_distance(char *s1, char *s2);
ssize_t damerau_levenshtein_distance_replace_cost(char *s1, char *s2, size_t replace_cost);
double jaro_distance(const char *s1, const char *s2);
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold);
double jaro_winkler_distance(const char *s1, const char *s2);
#endif