580 lines
17 KiB
C
580 lines
17 KiB
C
#include "string_similarity.h"
|
|
#include "string_utils.h"
|
|
|
|
#include <limits.h>
|
|
|
|
static affine_gap_edits_t NULL_AFFINE_GAP_EDITS = {
|
|
.num_matches = 0,
|
|
.num_mismatches = 0,
|
|
.num_transpositions = 0,
|
|
.num_gap_opens = 0,
|
|
.num_gap_extensions = 0
|
|
};
|
|
|
|
typedef enum {
|
|
AFFINE_CHAR_MATCH,
|
|
AFFINE_CHAR_MISMATCH,
|
|
AFFINE_TRANSPOSITION,
|
|
AFFINE_GAP_OPEN,
|
|
AFFINE_GAP_EXTEND
|
|
} affine_gap_op;
|
|
|
|
static inline bool space_or_equivalent(int32_t c) {
|
|
int cat = utf8proc_category(c);
|
|
return utf8_is_whitespace(c) || utf8_is_hyphen(c) || utf8_is_punctuation(cat);
|
|
}
|
|
|
|
affine_gap_edits_t affine_gap_distance_unicode_costs(uint32_array *u1_array, uint32_array *u2_array, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) {
|
|
if (u1_array->n < u2_array->n) {
|
|
uint32_array *tmp_array = u1_array;
|
|
u1_array = u2_array;
|
|
u2_array = tmp_array;
|
|
}
|
|
|
|
size_t m = u1_array->n;
|
|
size_t n = u2_array->n;
|
|
|
|
uint32_t *u1 = u1_array->a;
|
|
uint32_t *u2 = u2_array->a;
|
|
|
|
affine_gap_edits_t edits = NULL_AFFINE_GAP_EDITS;
|
|
|
|
if (unicode_equals(u1_array, u2_array)) {
|
|
edits.num_matches = n;
|
|
return edits;
|
|
}
|
|
|
|
size_t num_bytes = (m + 1) * sizeof(size_t);
|
|
|
|
size_t *C = malloc(num_bytes);
|
|
if (C == NULL) {
|
|
return NULL_AFFINE_GAP_EDITS;
|
|
}
|
|
|
|
size_t *D = malloc(num_bytes);
|
|
if (D == NULL) {
|
|
free(C);
|
|
return NULL_AFFINE_GAP_EDITS;
|
|
}
|
|
|
|
affine_gap_edits_t *E = malloc((m + 1) * sizeof(affine_gap_edits_t));
|
|
if (E == NULL) {
|
|
free(C);
|
|
free(D);
|
|
return NULL_AFFINE_GAP_EDITS;
|
|
}
|
|
|
|
affine_gap_edits_t *ED = malloc((m + 1) * sizeof(affine_gap_edits_t));
|
|
if (ED == NULL) {
|
|
free(C);
|
|
free(D);
|
|
free(E);
|
|
return NULL_AFFINE_GAP_EDITS;
|
|
}
|
|
|
|
size_t e = 0, c = 0, s = 0;
|
|
|
|
C[0] = 0;
|
|
E[0] = NULL_AFFINE_GAP_EDITS;
|
|
size_t t = start_gap_cost;
|
|
|
|
affine_gap_edits_t base_edits = NULL_AFFINE_GAP_EDITS;
|
|
base_edits.num_gap_opens++;
|
|
|
|
for (size_t j = 1; j < m + 1; j++) {
|
|
t += extend_gap_cost;
|
|
C[j] = t;
|
|
D[j] = t + start_gap_cost;
|
|
base_edits.num_gap_extensions++;
|
|
E[j] = base_edits;
|
|
ED[j] = base_edits;
|
|
}
|
|
|
|
t = start_gap_cost;
|
|
base_edits = NULL_AFFINE_GAP_EDITS;
|
|
base_edits.num_gap_opens++;
|
|
|
|
affine_gap_edits_t current_edits = NULL_AFFINE_GAP_EDITS;
|
|
affine_gap_edits_t prev_char_edits = NULL_AFFINE_GAP_EDITS;
|
|
affine_gap_edits_t prev_row_prev_char_edits = NULL_AFFINE_GAP_EDITS;
|
|
|
|
bool in_gap = false;
|
|
|
|
for (size_t i = 1; i < n + 1; i++) {
|
|
// s = CC[0]
|
|
s = C[0];
|
|
uint32_t c2 = u2[i - 1];
|
|
// CC[0] = c = t = t + h
|
|
t += extend_gap_cost;
|
|
c = t;
|
|
C[0] = c;
|
|
|
|
prev_row_prev_char_edits = E[0];
|
|
base_edits.num_gap_extensions++;
|
|
prev_char_edits = base_edits;
|
|
E[0] = prev_char_edits;
|
|
|
|
// e = t + g
|
|
e = t + start_gap_cost;
|
|
|
|
affine_gap_op op = AFFINE_GAP_OPEN;
|
|
|
|
ssize_t match_at = -1;
|
|
|
|
size_t min_at = 0;
|
|
size_t min_cost = SIZE_MAX;
|
|
|
|
for (size_t j = 1; j < m + 1; j++) {
|
|
// insertion
|
|
// e = min(e, c + g) + h
|
|
size_t min = e;
|
|
uint32_t c1 = u1[j - 1];
|
|
|
|
affine_gap_op insert_op = AFFINE_GAP_OPEN;
|
|
|
|
if ((c + start_gap_cost) < min) {
|
|
min = c + start_gap_cost;
|
|
insert_op = AFFINE_GAP_OPEN;
|
|
} else {
|
|
insert_op = AFFINE_GAP_EXTEND;
|
|
}
|
|
|
|
e = min + extend_gap_cost;
|
|
|
|
// deletion
|
|
// DD[j] = min(DD[j], CC[j] + g) + h
|
|
|
|
affine_gap_op delete_op = AFFINE_GAP_OPEN;
|
|
|
|
min = D[j];
|
|
affine_gap_edits_t delete_edits = ED[j];
|
|
affine_gap_edits_t delete_edits_stored = delete_edits;
|
|
delete_op = AFFINE_GAP_OPEN;
|
|
if (C[j] + start_gap_cost < min) {
|
|
min = C[j] + start_gap_cost;
|
|
|
|
delete_edits = delete_edits_stored = E[j];
|
|
delete_edits_stored.num_gap_opens++;
|
|
}
|
|
|
|
D[j] = min + extend_gap_cost;
|
|
delete_edits_stored.num_gap_extensions++;
|
|
ED[j] = delete_edits_stored;
|
|
|
|
// Cost
|
|
// c = min(DD[j], e, s + w(a, b))
|
|
|
|
affine_gap_op current_op = delete_op;
|
|
|
|
|
|
min = D[j];
|
|
|
|
// Delete transition
|
|
current_edits = delete_edits;
|
|
|
|
if (e < min) {
|
|
min = e;
|
|
// Insert transition
|
|
current_op = insert_op;
|
|
current_edits = prev_char_edits;
|
|
}
|
|
|
|
bool both_separators = space_or_equivalent((int32_t)c1) && space_or_equivalent((int32_t)c2);
|
|
|
|
bool is_transpose = false;
|
|
size_t w = c1 != c2 && !both_separators ? mismatch_cost : match_cost;
|
|
|
|
if (c1 != c2 && utf8_is_letter(utf8proc_category(c2)) && utf8_is_letter(utf8proc_category(c1)) && i < n && c2 == u1[i] && j < m && c1 == u2[j]) {
|
|
w = transpose_cost;
|
|
is_transpose = true;
|
|
}
|
|
|
|
if (s + w < min) {
|
|
min = s + w;
|
|
|
|
// Match/mismatch/transpose transition
|
|
current_edits = prev_row_prev_char_edits;
|
|
|
|
if ((c1 == c2 || both_separators) && !is_transpose) {
|
|
current_op = AFFINE_CHAR_MATCH;
|
|
} else if (!is_transpose) {
|
|
current_op = AFFINE_CHAR_MISMATCH;
|
|
} else if (is_transpose) {
|
|
current_op = AFFINE_TRANSPOSITION;
|
|
}
|
|
}
|
|
|
|
if (current_op == AFFINE_CHAR_MATCH) {
|
|
current_edits.num_matches++;
|
|
} else if (current_op == AFFINE_CHAR_MISMATCH) {
|
|
current_edits.num_mismatches++;
|
|
} else if (current_op == AFFINE_GAP_EXTEND) {
|
|
current_edits.num_gap_extensions++;
|
|
} else if (current_op == AFFINE_GAP_OPEN) {
|
|
current_edits.num_gap_opens++;
|
|
current_edits.num_gap_extensions++;
|
|
} else if (current_op == AFFINE_TRANSPOSITION) {
|
|
current_edits.num_transpositions++;
|
|
}
|
|
|
|
if (min < min_cost) {
|
|
op = current_op;
|
|
min_cost = min;
|
|
min_at = j;
|
|
}
|
|
|
|
c = min;
|
|
s = C[j];
|
|
C[j] = c;
|
|
|
|
prev_char_edits = current_edits;
|
|
prev_row_prev_char_edits = E[j];
|
|
E[j] = prev_char_edits;
|
|
|
|
// In the case of a transposition, duplicate costs for next character and advance by 2
|
|
if (current_op == AFFINE_TRANSPOSITION) {
|
|
E[j + 1] = E[j];
|
|
C[j + 1] = C[j];
|
|
j++;
|
|
}
|
|
}
|
|
|
|
if (op == AFFINE_TRANSPOSITION) {
|
|
i++;
|
|
}
|
|
|
|
}
|
|
|
|
edits = E[m];
|
|
free(C);
|
|
free(D);
|
|
free(E);
|
|
free(ED);
|
|
|
|
return edits;
|
|
|
|
}
|
|
|
|
affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
|
|
return affine_gap_distance_unicode_costs(u1_array, u2_array, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST);
|
|
}
|
|
|
|
affine_gap_edits_t affine_gap_distance_costs(const char *s1, const char *s2, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) {
|
|
if (s1 == NULL || s2 == NULL) return NULL_AFFINE_GAP_EDITS;
|
|
|
|
uint32_array *u1_array = unicode_codepoints(s1);
|
|
if (u1_array == NULL) return NULL_AFFINE_GAP_EDITS;
|
|
|
|
uint32_array *u2_array = unicode_codepoints(s2);
|
|
|
|
if (u2_array == NULL) {
|
|
uint32_array_destroy(u1_array);
|
|
return NULL_AFFINE_GAP_EDITS;
|
|
}
|
|
|
|
affine_gap_edits_t edits = affine_gap_distance_unicode_costs(u1_array, u2_array, start_gap_cost, extend_gap_cost, match_cost, mismatch_cost, transpose_cost);
|
|
|
|
uint32_array_destroy(u1_array);
|
|
uint32_array_destroy(u2_array);
|
|
|
|
return edits;
|
|
}
|
|
|
|
|
|
affine_gap_edits_t affine_gap_distance(const char *s1, const char *s2) {
|
|
return affine_gap_distance_costs(s1, s2, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST);
|
|
}
|
|
|
|
|
|
bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_array *u2_array, affine_gap_edits_t edits) {
|
|
size_t len1 = u1_array->n;
|
|
size_t len2 = u2_array->n;
|
|
if (len1 == 0 || len2 == 0) return false;
|
|
|
|
size_t min_len = len1 < len2 ? len1 : len2;
|
|
|
|
return edits.num_matches == min_len && u1_array->a[0] == u2_array->a[0];
|
|
}
|
|
|
|
inline bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array) {
|
|
affine_gap_edits_t edits = affine_gap_distance_unicode(u1_array, u2_array);
|
|
|
|
return possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits);
|
|
}
|
|
|
|
|
|
bool possible_abbreviation_unicode_strict(uint32_array *u1_array, uint32_array *u2_array) {
|
|
size_t len1 = u1_array->n;
|
|
size_t len2 = u2_array->n;
|
|
if (len1 == 0 || len2 == 0) return false;
|
|
|
|
size_t min_len = len1 < len2 ? len1 : len2;
|
|
|
|
ssize_t prefix_len = unicode_common_prefix(u1_array, u2_array);
|
|
if (prefix_len == min_len) return true;
|
|
ssize_t suffix_len = unicode_common_suffix(u1_array, u2_array);
|
|
return suffix_len > 0 && prefix_len > 0 && possible_abbreviation_unicode(u1_array, u2_array);
|
|
}
|
|
|
|
static bool possible_abbreviation_options(const char *s1, const char *s2, bool strict) {
|
|
if (s1 == NULL || s2 == NULL) return false;
|
|
|
|
uint32_array *u1_array = unicode_codepoints(s1);
|
|
if (u1_array == NULL) return false;
|
|
|
|
uint32_array *u2_array = unicode_codepoints(s2);
|
|
|
|
if (u2_array == NULL) {
|
|
uint32_array_destroy(u1_array);
|
|
return false;
|
|
}
|
|
|
|
bool abbrev = false;
|
|
if (!strict) {
|
|
abbrev = possible_abbreviation_unicode(u1_array, u2_array);
|
|
} else {
|
|
abbrev = possible_abbreviation_unicode_strict(u1_array, u2_array);
|
|
}
|
|
|
|
uint32_array_destroy(u1_array);
|
|
uint32_array_destroy(u2_array);
|
|
|
|
return abbrev;
|
|
}
|
|
|
|
inline bool possible_abbreviation(const char *s1, const char *s2) {
|
|
return possible_abbreviation_options(s1, s2, false);
|
|
}
|
|
|
|
inline bool possible_abbreviation_strict(const char *s1, const char *s2) {
|
|
return possible_abbreviation_options(s1, s2, true);
|
|
}
|
|
|
|
|
|
ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) {
|
|
size_t len1 = u1_array->n;
|
|
size_t len2 = u2_array->n;
|
|
|
|
uint32_t *u1 = u1_array->a;
|
|
uint32_t *u2 = u2_array->a;
|
|
|
|
size_t num_bytes = (len1 + 1) * sizeof(size_t);
|
|
|
|
size_t *column = malloc(num_bytes);
|
|
if (column == NULL) {
|
|
return -1.0;
|
|
}
|
|
|
|
for (size_t y = 1; y <= len1; y++) {
|
|
column[y] = y;
|
|
}
|
|
|
|
size_t transpose_diag = 0;
|
|
size_t last_diag = 0;
|
|
|
|
for (size_t x = 1; x <= len2; x++) {
|
|
column[0] = x;
|
|
for (size_t y = 1, last_diag = x - 1; y <= len1; y++) {
|
|
size_t old_diag = column[y];
|
|
size_t cost = (u1[y - 1] == u2[x - 1] ? 0 : 1);
|
|
|
|
size_t v1 = column[y] + 1;
|
|
size_t v2 = column[y - 1] + 1;
|
|
size_t v3 = last_diag + cost;
|
|
|
|
size_t min = v1;
|
|
if (v2 < min) min = v2;
|
|
if (v3 < min) min = v3;
|
|
|
|
if (x > 1 && y > 1 && u1[y - 1] == u2[x - 2] && u1[y - 2] == u2[x - 1]) {
|
|
size_t v4 = transpose_diag + cost;
|
|
if (v4 < min) min = v4;
|
|
}
|
|
|
|
column[y] = min;
|
|
|
|
last_diag = old_diag;
|
|
}
|
|
transpose_diag = last_diag;
|
|
}
|
|
|
|
size_t dist = column[len1];
|
|
free(column);
|
|
return (ssize_t)dist;
|
|
}
|
|
|
|
ssize_t damerau_levenshtein_distance_replace_cost(const char *s1, const char *s2, size_t replace_cost) {
|
|
if (s1 == NULL || s2 == NULL) return -1;
|
|
|
|
uint32_array *u1_array = unicode_codepoints(s1);
|
|
if (u1_array == NULL) return -1.0;
|
|
|
|
uint32_array *u2_array = unicode_codepoints(s2);
|
|
|
|
if (u2_array == NULL) {
|
|
uint32_array_destroy(u1_array);
|
|
return -1.0;
|
|
}
|
|
|
|
ssize_t lev = damerau_levenshtein_distance_unicode(u1_array, u2_array, replace_cost);
|
|
|
|
uint32_array_destroy(u1_array);
|
|
uint32_array_destroy(u2_array);
|
|
return lev;
|
|
}
|
|
|
|
ssize_t damerau_levenshtein_distance(const char *s1, const char *s2) {
|
|
return damerau_levenshtein_distance_replace_cost(s1, s2, 0);
|
|
}
|
|
|
|
double jaro_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
|
|
if (u1_array == NULL || u2_array == NULL) return -1.0;
|
|
|
|
size_t len1 = u1_array->n;
|
|
size_t len2 = u2_array->n;
|
|
// If both strings are zero-length, return 1. If only one is, return 0
|
|
if (len1 == 0) return len2 == 0 ? 1.0 : 0.0;
|
|
|
|
size_t max_len = len1 > len2 ? len1 : len2;
|
|
size_t match_distance = (max_len / 2) - 1;
|
|
|
|
uint8_t *u1_matches = calloc(len2, sizeof(uint8_t));
|
|
uint8_t *u2_matches = calloc(len1, sizeof(uint8_t));
|
|
|
|
uint32_t *u1 = u1_array->a;
|
|
uint32_t *u2 = u2_array->a;
|
|
|
|
double matches = 0.0;
|
|
double transpositions = 0.0;
|
|
|
|
size_t i = 0;
|
|
|
|
// count matches
|
|
for (size_t i = 0; i < len1; i++) {
|
|
// start and end take into account the match distance
|
|
size_t start = i > match_distance ? i - match_distance : 0;
|
|
size_t end = (i + match_distance + 1) < len2 ? i + match_distance + 1 : len2;
|
|
|
|
for (size_t k = start; k < end; k++) {
|
|
// already a match at k
|
|
if (u2_matches[k]) continue;
|
|
// codepoints not equal
|
|
if (u1[i] != u2[k]) continue;
|
|
// otherwise record a match on both sides and increment counter
|
|
u1_matches[i] = true;
|
|
u2_matches[k] = true;
|
|
matches++;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (matches == 0) {
|
|
free(u1_matches);
|
|
free(u2_matches);
|
|
return 0.0;
|
|
}
|
|
|
|
|
|
// count transpositions
|
|
size_t k = 0;
|
|
for (size_t i = 0; i < len1; i++) {
|
|
// wait for a match in u1
|
|
if (!u1_matches[i]) continue;
|
|
// get the next matched character in u2
|
|
while (!u2_matches[k]) k++;
|
|
// it's a transposition
|
|
if (u1[i] != u2[k]) transpositions++;
|
|
k++;
|
|
}
|
|
|
|
// transpositions double-count transposed characters, so divide by 2
|
|
transpositions /= 2.0;
|
|
|
|
free(u1_matches);
|
|
free(u2_matches);
|
|
|
|
// Jaro distance
|
|
return ((matches / len1) +
|
|
(matches / len2) +
|
|
((matches - transpositions) / matches)) / 3.0;
|
|
}
|
|
|
|
double jaro_distance(const char *s1, const char *s2) {
|
|
if (s1 == NULL || s2 == NULL) {
|
|
return -1.0;
|
|
}
|
|
|
|
uint32_array *u1_array = unicode_codepoints(s1);
|
|
if (u1_array == NULL) return -1.0;
|
|
|
|
uint32_array *u2_array = unicode_codepoints(s2);
|
|
|
|
if (u2_array == NULL) {
|
|
uint32_array_destroy(u1_array);
|
|
return -1.0;
|
|
}
|
|
|
|
double jaro = jaro_distance_unicode(u1_array, u2_array);
|
|
uint32_array_destroy(u1_array);
|
|
uint32_array_destroy(u2_array);
|
|
return jaro;
|
|
}
|
|
|
|
double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, uint32_array *u2_array, double prefix_scale, double bonus_threshold) {
|
|
double jaro = jaro_distance_unicode(u1_array, u2_array);
|
|
|
|
double j;
|
|
|
|
size_t len1 = u1_array->n;
|
|
size_t len2 = u2_array->n;
|
|
|
|
uint32_t *u1 = u1_array->a;
|
|
uint32_t *u2 = u2_array->a;
|
|
|
|
size_t m = len1 < len2 ? len1 : len2;
|
|
|
|
size_t i = 0;
|
|
for (; i < m; i++) {
|
|
if (u1[i] != u2[i]) break;
|
|
}
|
|
|
|
double jaro_winkler = jaro;
|
|
|
|
if (jaro >= bonus_threshold) {
|
|
jaro_winkler += (1.0 - jaro_winkler) * i * prefix_scale;
|
|
}
|
|
|
|
return jaro_winkler > 1.0 ? 1.0 : jaro_winkler;
|
|
}
|
|
|
|
double jaro_winkler_distance_prefix_threshold(const char *s1, const char *s2, double prefix_scale, double bonus_threshold) {
|
|
if (s1 == NULL || s2 == NULL) {
|
|
return -1.0;
|
|
}
|
|
|
|
uint32_array *u1_array = unicode_codepoints(s1);
|
|
if (u1_array == NULL) return -1.0;
|
|
|
|
uint32_array *u2_array = unicode_codepoints(s2);
|
|
|
|
if (u2_array == NULL) {
|
|
uint32_array_destroy(u1_array);
|
|
return -1.0;
|
|
}
|
|
|
|
double jaro_winkler = jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, prefix_scale, bonus_threshold);
|
|
|
|
uint32_array_destroy(u1_array);
|
|
uint32_array_destroy(u2_array);
|
|
|
|
return jaro_winkler;
|
|
}
|
|
|
|
inline double jaro_winkler_distance(const char *s1, const char *s2) {
|
|
return jaro_winkler_distance_prefix_threshold(s1, s2, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
|
|
}
|
|
|
|
inline double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
|
|
return jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
|
|
}
|