From b34e5783661990db8ba16ecd1f1ba68cff36c995 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 11 Nov 2017 04:02:28 -0500 Subject: [PATCH] [similarity] using the new sequence alignment breakdown by operation to tell whether one of two words may be an abbreviation of the other. The loose variant requires that the number of matches in the alignment equals the length of the shorter string, which accepts pairs like Services vs. Svc, whereas the strict variant requires that either the shorter string is a prefix of the longer one (Inc and Incorporated) or that the two strings share both a prefix and a suffix and also pass the loose check (Dept and Department). Both variants require that the strings share at least the first letter. --- src/string_similarity.c | 77 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 6 deletions(-) diff --git a/src/string_similarity.c b/src/string_similarity.c index bfdb1f11..806d28ea 100644 --- a/src/string_similarity.c +++ b/src/string_similarity.c @@ -245,13 +245,13 @@ affine_gap_edits_t affine_gap_distance_unicode_costs(uint32_array *u1_array, uin } - affine_gap_edits_t ret = E[m]; + edits = E[m]; free(C); free(D); free(E); free(ED); - return ret; + return edits; } @@ -259,7 +259,7 @@ affine_gap_edits_t affine_gap_distance_unicode(uint32_array *u1_array, uint32_ar return affine_gap_distance_unicode_costs(u1_array, u2_array, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); } -affine_gap_edits_t affine_gap_distance_costs(char *s1, char *s2, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { +affine_gap_edits_t affine_gap_distance_costs(const char *s1, const char *s2, size_t start_gap_cost, size_t extend_gap_cost, size_t match_cost, size_t mismatch_cost, size_t transpose_cost) { if (s1 == NULL || s2 == NULL) return NULL_AFFINE_GAP_EDITS; uint32_array *u1_array = unicode_codepoints(s1); @@ -272,20 +272,85 @@ 
affine_gap_edits_t affine_gap_distance_costs(char *s1, char *s2, size_t start_ga return NULL_AFFINE_GAP_EDITS; } - affine_gap_edits_t affine_gap = affine_gap_distance_unicode_costs(u1_array, u2_array, start_gap_cost, extend_gap_cost, match_cost, mismatch_cost, transpose_cost); + affine_gap_edits_t edits = affine_gap_distance_unicode_costs(u1_array, u2_array, start_gap_cost, extend_gap_cost, match_cost, mismatch_cost, transpose_cost); uint32_array_destroy(u1_array); uint32_array_destroy(u2_array); - return affine_gap; + return edits; } -affine_gap_edits_t affine_gap_distance(char *s1, char *s2) { +affine_gap_edits_t affine_gap_distance(const char *s1, const char *s2) { return affine_gap_distance_costs(s1, s2, DEFAULT_AFFINE_GAP_OPEN_COST, DEFAULT_AFFINE_GAP_EXTEND_COST, DEFAULT_AFFINE_GAP_MATCH_COST, DEFAULT_AFFINE_GAP_MISMATCH_COST, DEFAULT_AFFINE_GAP_TRANSPOSE_COST); } +bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_array *u2_array, affine_gap_edits_t edits) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 == 0 || len2 == 0) return false; + + size_t min_len = len1 < len2 ? len1 : len2; + + return edits.num_matches == min_len && u1_array->a[0] == u2_array->a[0]; +} + +inline bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array) { + affine_gap_edits_t edits = affine_gap_distance_unicode(u1_array, u2_array); + + return possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits); +} + + +bool possible_abbreviation_unicode_strict(uint32_array *u1_array, uint32_array *u2_array) { + size_t len1 = u1_array->n; + size_t len2 = u2_array->n; + if (len1 == 0 || len2 == 0) return false; + + size_t min_len = len1 < len2 ? 
len1 : len2; + + ssize_t prefix_len = unicode_common_prefix(u1_array, u2_array); + if (prefix_len == min_len) return true; + ssize_t suffix_len = unicode_common_suffix(u1_array, u2_array); + return suffix_len > 0 && prefix_len > 0 && possible_abbreviation_unicode(u1_array, u2_array); +} + +static bool possible_abbreviation_options(const char *s1, const char *s2, bool strict) { + if (s1 == NULL || s2 == NULL) return false; + + uint32_array *u1_array = unicode_codepoints(s1); + if (u1_array == NULL) return false; + + uint32_array *u2_array = unicode_codepoints(s2); + + if (u2_array == NULL) { + uint32_array_destroy(u1_array); + return false; + } + + bool abbrev = false; + if (!strict) { + abbrev = possible_abbreviation_unicode(u1_array, u2_array); + } else { + abbrev = possible_abbreviation_unicode_strict(u1_array, u2_array); + } + + uint32_array_destroy(u1_array); + uint32_array_destroy(u2_array); + + return abbrev; +} + +inline bool possible_abbreviation(const char *s1, const char *s2) { + return possible_abbreviation_options(s1, s2, false); +} + +inline bool possible_abbreviation_strict(const char *s1, const char *s2) { + return possible_abbreviation_options(s1, s2, true); +} + + ssize_t damerau_levenshtein_distance_unicode(uint32_array *u1_array, uint32_array *u2_array, size_t replace_cost) { size_t len1 = u1_array->n; size_t len2 = u2_array->n;