[similarity] adding a multi-word alignmnet algorithm for streets and names like "de la cruz" vs. "dela cruz" or "Oceanwalk Ter" vs. "Ocean Walk Ter"

2018-02-23 01:22:12 -05:00
parent c5bb9d8daa
commit 2b4e7073c2
2 changed files with 92 additions and 0 deletions
--- a/src/string_similarity.c
+++ b/src/string_similarity.c
@@ -588,3 +588,93 @@ inline double jaro_winkler_distance(const char *s1, const char *s2) {
 inline double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array) {
    return jaro_winkler_distance_unicode_prefix_threshold(u1_array, u2_array, DEFAULT_JARO_WINKLER_PREFIX_SCALE, DEFAULT_JARO_WINKLER_BONUS_THRESHOLD);
 }
+
+phrase_array *multi_word_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2) {
+    if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) {
+        return NULL;
+    }
+
+    size_t len1 = tokens1->n;
+    size_t len2 = tokens2->n;
+    if (len1 == 0 || len2 == 0 || len1 == len2) return NULL;
+
+    if (len1 > len2) {
+        const char *tmp_s = s1;
+        s1 = s2;
+        s2 = tmp_s;
+
+        token_array *tmp_t = tokens1;
+        tokens1 = tokens2;
+        tokens2 = tmp_t;
+
+        size_t tmp_l = len1;
+        len1 = len2;
+        len2 = tmp_l;
+    }
+
+    phrase_array *alignments = NULL;
+
+    token_t *t1 = tokens1->a;
+    token_t *t2 = tokens2->a;
+
+    ssize_t phrase_start = -1;
+    ssize_t phrase_token_pos = -1;
+
+    uint8_t *ptr1 = (uint8_t *)s1;
+    uint8_t *ptr2 = (uint8_t *)s2;
+
+    int32_t c1;
+    ssize_t c1_len;
+
+    for (size_t i = 0; i < len1; i++) {
+        token_t ti = t1[i];
+
+        c1_len = utf8proc_iterate(ptr1 + ti.offset, ti.len, &c1);
+        if (c1_len <= 0 || c1 == 0) {
+            break;
+        }
+
+        if (!(is_word_token(ti.type) || is_numeric_token(ti.type)) || is_ideographic(ti.type)) {
+            phrase_token_pos = -1;
+            continue;
+        }
+
+        size_t ti_pos = 0;
+
+        for (size_t j = 0; j < len2; j++) {
+            token_t tj = t2[j];
+
+            if (utf8_compare_len_case_insensitive(ptr1 + ti.offset + ti_pos, ptr2 + tj.offset, tj.len) == 0) {
+                ti_pos += tj.len;
+                if (phrase_start < 0) {
+                    phrase_start = j;
+                    phrase_token_pos = 0;
+                }
+                phrase_token_pos++;
+            } else {
+                phrase_token_pos = -1;
+                phrase_start = -1;
+                ti_pos = 0;
+                continue;
+            }
+
+            if (ti_pos == ti.len && j - phrase_start > 0) {
+                phrase_t phrase = (phrase_t){phrase_start, j - phrase_start + 1, i};
+                // got alignment
+                if (alignments == NULL) {
+                    alignments = phrase_array_new();
+                }
+
+                phrase_array_push(alignments, phrase);
+
+                ti_pos = 0;
+                phrase_token_pos = -1;
+                phrase_start = -1;
+            }
+        }
+
+    }
+
+    return alignments;
+}
+
--- a/src/string_similarity.h
+++ b/src/string_similarity.h
@@ -5,6 +5,7 @@
 #include <stdlib.h>

 #include "collections.h"
+#include "trie_search.h"

 #define DEFAULT_AFFINE_GAP_OPEN_COST 3
 #define DEFAULT_AFFINE_GAP_EXTEND_COST 2
@@ -43,5 +44,6 @@ double jaro_winkler_distance_unicode_prefix_threshold(uint32_array *u1_array, ui
 double jaro_winkler_distance(const char *s1, const char *s2);
 double jaro_winkler_distance_unicode(uint32_array *u1_array, uint32_array *u2_array);

+phrase_array *multi_word_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2);

 #endif