Files
libpostal/src/string_similarity.c

217 lines
5.7 KiB
C

#include "string_similarity.h"
#include "string_utils.h"
size_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
size_t len1 = u1_array->n;
size_t len2 = u2_array->n;
uint32_t *u1 = u1_array->a;
uint32_t *u2 = u2_array->a;
size_t num_bytes = (len1 + 1) * sizeof(size_t);
size_t *column = malloc(num_bytes);
for (size_t y = 1; y <= len1; y++) {
column[y] = y;
}
size_t transpose_diag = 0;
size_t last_diag = 0;
for (size_t x = 1; x <= len2; x++) {
column[0] = x;
for (size_t y = 1, last_diag = x - 1; y <= len1; y++) {
size_t old_diag = column[y];
size_t cost = (u1[y - 1] == u2[x - 1] ? 0 : 1);
size_t v1 = column[y] + 1;
size_t v2 = column[y - 1] + 1;
size_t v3 = last_diag + cost;
size_t min = v1;
if (v2 < min) min = v2;
if (v3 < min) min = v3;
if (x > 1 && y > 1 && u1[y - 1] == u2[x - 2] && u1[y - 2] == u2[x - 1]) {
size_t v4 = transpose_diag + cost;
if (v4 < min) min = v4;
}
column[y] = min;
last_diag = old_diag;
}
transpose_diag = last_diag;
}
size_t dist = column[len1];
free(column);
return dist;
}
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) {
if (s1 == NULL || s2 == NULL) return -1;
uint32_array *u1 = unicode_codepoints(s1);
if (u1 == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2);
if (u2 == NULL) {
uint32_array_destroy(u1);
return -1.0;
}
ssize_t lev = damerau_levenshtein_distance_unicode(u1, u2, replace_cost);
uint32_array_destroy(u1);
uint32_array_destroy(u2);
return lev;
}
ssize_t damerau_levenshtein_distance(const char *s1, const char *s2) {
return damerau_levenshtein_distance_replace_cost(s1, s2, 0);
}
double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
if (u1_array == NULL || u2_array == NULL) return -1.0;
size_t len1 = u1_array->n;
size_t len2 = u2_array->n;
// If both strings are zero-length, return 1. If only one is, return 0
if (len1 == 0) return len2 == 0 ? 1.0 : 0.0;
size_t max_len = len1 > len2 ? len1 : len2;
size_t match_distance = (max_len / 2) - 1;
uint8_t *u1_matches = calloc(len2, sizeof(uint8_t));
uint8_t *u2_matches = calloc(len1, sizeof(uint8_t));
uint32_t *u1 = u1_array->a;
uint32_t *u2 = u2_array->a;
double matches = 0.0;
double transpositions = 0.0;
size_t i = 0;
// count matches
for (size_t i = 0; i < len1; i++) {
// start and end take into account the match distance
size_t start = i > match_distance ? i - match_distance : 0;
size_t end = (i + match_distance + 1) < len2 ? i + match_distance + 1 : len2;
for (size_t k = start; k < end; k++) {
// already a match at k
if (u2_matches[k]) continue;
// codepoints not equal
if (u1[i] != u2[k]) continue;
// otherwise record a match on both sides and increment counter
u1_matches[i] = true;
u2_matches[k] = true;
matches++;
break;
}
}
if (matches == 0) {
free(u1_matches);
free(u2_matches);
return 0.0;
}
// count transpositions
size_t k = 0;
for (size_t i = 0; i < len1; i++) {
// wait for a match in u1
if (!u1_matches[i]) continue;
// get the next matched character in u2
while (!u2_matches[k]) k++;
// it's a transposition
if (u1[i] != u2[k]) transpositions++;
k++;
}
// transpositions double-count transposed characters, so divide by 2
transpositions /= 2.0;
free(u1_matches);
free(u2_matches);
// Jaro distance
return ((matches / len1) +
(matches / len2) +
((matches - transpositions) / matches)) / 3.0;
}
double jaro_distance(const char *s1, const char *s2) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1 = unicode_codepoints(s1);
if (u1 == NULL) return -1.0;
uint32_array *u2 = unicode_codepoints(s2);
if (u2 == NULL) {
uint32_array_destroy(u1);
return -1.0;
}
double jaro = jaro_distance_unicode(u1, u2);
uint32_array_destroy(u1);
uint32_array_destroy(u2);
return jaro;
}
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
if (s1 == NULL || s2 == NULL) {
return -1.0;
}
uint32_array *u1_array = unicode_codepoints(s1);
if (u1_array == NULL) return -1.0;
uint32_array *u2_array = unicode_codepoints(s2);
if (u2_array == NULL) {
uint32_array_destroy(u1_array);
return -1.0;
}
double jaro = jaro_distance_unicode(u1_array, u2_array);
double j;
size_t len1 = u1_array->n;
size_t len2 = u2_array->n;
uint32_t *u1 = u1_array->a;
uint32_t *u2 = u2_array->a;
size_t m = len1 < len2 ? len1 : len2;
size_t i = 0;
for (; i < m; i++) {
if (u1[i] != u2[i]) break;
}
double jaro_winkler = jaro;
if (jaro >= bonus_threshold) {
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
}
uint32_array_destroy(u1_array);
uint32_array_destroy(u2_array);
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
}
inline double jaro_winkler_distance(const char *s1, const char *s2) {
return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
}