[similarity] adding basic double metaphone implementation

2017-10-18 03:59:05 -04:00
parent 2f2d3da722
commit 3a3aca8490
2 changed files with 999 additions and 0 deletions
--- a/src/double_metaphone.c
+++ b/src/double_metaphone.c
@@ -0,0 +1,982 @@
+#include <stdarg.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "double_metaphone.h"
+#include "string_utils.h"
+#include "utf8proc/utf8proc.h"
+
+static bool is_vowel(char c) {
+    return (c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U' || c == 'Y');
+}
+
+static char get_char_at(char *str, size_t len, ssize_t idx) {
+    if (idx < 0 || idx >= len) return 0;
+    return str[idx];
+}
+
+static char *get_string_at(char *str, size_t len, ssize_t idx) {
+    if (idx < 0 || idx >= len) return NULL;
+    return str + idx;
+}
+
+static inline bool is_slavo_germanic(char *s) {
+    return strstr(s, "W")
+        || strstr(s, "K")
+        || strstr(s, "CZ")
+        || strstr(s, "WITZ");
+}
+
+static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t substr_len, size_t nargs, ...) {
+    char *string_at_index = get_string_at(str, len, index);
+    if (string_at_index == NULL) return false;
+
+    va_list args;
+    char *sub;
+
+    va_start(args, nargs);
+
+    bool matched = false;
+
+    for (size_t i = 0; i < nargs; i++) {
+        sub = va_arg(args, char *);
+        if (sub == NULL) break;
+
+        if (utf8_compare_len(string_at_index, sub, substr_len) == 0) {
+            matched = true;
+            break;
+        }
+    }
+
+    va_end(args);
+
+    return matched;
+
+}
+
+
+double_metaphone_codes_t *double_metaphone(char *input) {
+    if (input == NULL) return NULL;
+
+    char *ptr = utf8_upper(input);
+
+    char *normalized = (char *)utf8proc_NFD((utf8proc_uint8_t *)ptr);
+
+    if (normalized != NULL) {
+        free(ptr);
+        ptr = normalized;        
+    }
+
+    if (ptr == NULL) {
+        return NULL;
+    }
+
+    char *str = ptr;
+
+    size_t len = strlen(str);
+    char_array *primary = char_array_new_size(len + 1);
+    char_array *secondary = char_array_new_size(len + 1);
+
+    bool slavo_germanic = is_slavo_germanic(str);
+
+    size_t current = 0;
+    size_t last = len - 1;
+
+    if (substring_equals(str, len, current, 2, 1, "ʻ")) {
+        str += 2;
+    } else if (get_char_at(str, len, current) == '\'') {
+        str++;
+    }
+
+    if (substring_equals(str, len, current, 2, 5, "GN", "KN", "PN", "WR", "PS")) {
+        current++;
+    } else if (get_char_at(str, len, current) == 'X') {
+        char_array_append(primary, "S");
+        char_array_append(secondary, "S");
+        current++;
+    }
+
+    while (true) {
+        char c = *(str + current);
+        if (c == '\x00') break;
+
+        if (is_vowel(c) && current == 0) {
+            char_array_append(primary, "A");
+            char_array_append(secondary, "A");
+            current++;
+            continue;
+        } else if (c == 'B') {
+            /* "-mb", e.g", "dumb", already skipped over... */
+            char_array_append(primary, "P");
+            char_array_append(secondary, "P");
+
+            if (get_char_at(str, len, current + 1) == 'B') {
+                current += 2;
+            } else {
+                current++;
+            }
+            continue;
+        // C with cedilla (denormalized)
+        } else if (substring_equals(str, len, current, 3, 2, "C\xcc\xa7", "Ç")) { 
+            char_array_append(primary, "S");
+            char_array_append(secondary, "S");
+            current += 2;
+        } else if (c == 'C') {
+            // various germanic
+            if ((current > 1)
+                && !is_vowel(get_char_at(str, len, current - 2))
+                && substring_equals(str, len, current - 1, 3, 1, "ACH")
+                && ((get_char_at(str, len, current + 2) != 'I')
+                    && ((get_char_at(str, len, current + 2) != 'E')
+                        || substring_equals(str, len, current - 2, 6, 2, "BACHER", "MACHER"))
+                   )
+                ) 
+            {
+                char_array_append(primary, "K");
+                char_array_append(secondary, "K");
+                current += 2;
+                continue;
+            }
+
+            // special case for "caesar"
+            if ((current == 0)
+                && substring_equals(str, len, current, 6, 1, "CAESAR"))
+            {
+                char_array_append(primary, "S");
+                char_array_append(secondary, "K");
+                current += 2;
+                continue;
+            }
+
+            // Italian e.g. "chianti"
+            if (substring_equals(str, len, current, 4, 1, "CHIA")) {
+                char_array_append(primary, "K");
+                char_array_append(secondary, "K");
+                current += 2;
+                continue;
+            }
+
+            if (substring_equals(str, len, current, 2, 1, "CH")) {
+                // "michael"
+                if ((current > 0)
+                    && substring_equals(str, len, current, 4, 1, "CHAE"))
+                {
+                    char_array_append(primary, "K");
+                    char_array_append(secondary, "K");
+                    current += 2;
+                    continue;
+                }
+
+                // Greek roots e.g. "chemistry", "chorus"
+                if ((current == 0)
+                    && (substring_equals(str, len, current + 1, 5, 3, "HARAC", "HARIS", "HOREO")
+                     || substring_equals(str, len, current + 1, 4, 3, "HIRO", "HAOS", "HAOT")
+                     || substring_equals(str, len, current + 1, 3, 5, "HOR", "HYM", "HIA", "HEM", "HIM"))
+                    )
+                {
+                    char_array_append(primary, "K");
+                    char_array_append(secondary, "K");
+                    current += 2;
+                    continue;
+                }
+
+                // Germanic, Greek, or otherwise "ch" for "kh" sound
+                if (
+                      (substring_equals(str, len, 0, 4, 2, "VAN ", "VON ")
+                       || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ")
+                       || substring_equals(str, len, 0, 3, 1, "SCH"))
+                      // "ochestra", "orchid", "architect" but not "arch"
+                      || substring_equals(str, len, current - 2, 6, 1, "ORCHES", "ARCHIT", "ORCHID")
+                      || substring_equals(str, len, current + 2, 1, 2, "T", "S")
+                      || (
+                            ((current == 0) || substring_equals(str, len, current - 1, 1, 4, "A", "O", "U", "E"))
+                            // e.g. "wachtler", "wechsler", but not "tichner"
+                            && substring_equals(str, len, current + 2, 1, 10, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ")
+                         )
+                   )
+                {
+                    char_array_append(primary, "K");
+                    char_array_append(secondary, "K");
+                    current += 2;
+                    continue;
+                } else {
+                    if (current > 0) {
+                        if (substring_equals(str, len, 0, 2, 1, "MC")) {
+                            char_array_append(primary, "K");
+                            char_array_append(secondary, "K");
+                        } else {
+                            char_array_append(primary, "X");
+                            char_array_append(secondary, "K");
+                        }
+                    } else {
+                        char_array_append(primary, "X");
+                        char_array_append(secondary, "K");
+                    }
+                }
+                current += 2;
+                continue;
+            }
+
+            // e.g, "czerny"
+            if (substring_equals(str, len, current, 2, 1, "CZ")
+                && !substring_equals(str, len, current - 2, 4, 1, "WICZ"))
+            {
+                char_array_append(primary, "S");
+                char_array_append(secondary, "X");
+                current += 2;
+                continue;
+            }
+
+            // e.g. "focaccia"
+            if (substring_equals(str, len, current + 1, 3, 1, "CIA")) {
+                char_array_append(primary, "X");
+                char_array_append(secondary, "X");
+                current += 3;
+                continue;              
+            }
+
+            // double 'C' but not if e.g. "McClellan"
+            if (substring_equals(str, len, current, 2, 1, "CC")
+                && !((current == 1) && get_char_at(str, len, 0) == 'M'))
+            {
+                // "bellocchio" but not "bacchus"
+                if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "H")
+                    && !substring_equals(str, len, current + 2, 2, 1, "HU"))
+                {
+                    // "accident", "accede", "succeed"
+                    if (((current == 1)
+                         && (get_char_at(str, len, current - 1) == 'A'))
+                        || substring_equals(str, len, current - 1, 5, 2, "UCCEE", "UCCES"))
+                    {
+                        char_array_append(primary, "KS");
+                        char_array_append(secondary, "KS");
+                    } else {
+                        char_array_append(primary, "X");
+                        char_array_append(secondary, "X");
+                    }
+                    current += 3;
+                    continue;
+                }
+            } else {
+                char_array_append(primary, "K");
+                char_array_append(secondary, "K");
+                current += 2;
+                continue;
+            }
+
+            if (substring_equals(str, len, current, 2, 3, "CK", "CG", "CQ")) {
+                char_array_append(primary, "K");
+                char_array_append(secondary, "K");
+                current += 2;
+                continue;
+            }
+
+            if (substring_equals(str, len, current, 2, 3, "CI", "CE", "CY")) {
+                if (substring_equals(str, len, current, 3, 3, "CIO", "CIE", "CIA")) {
+                    char_array_append(primary, "S");
+                    char_array_append(secondary, "X");         
+                } else {
+                    char_array_append(primary, "S");
+                    char_array_append(secondary, "S");                    
+                }
+                current += 2;
+                continue;
+            }
+
+            // else
+            char_array_append(primary, "K");
+            char_array_append(secondary, "K");
+
+            if (substring_equals(str, len, current + 1, 2, 3, " C", " Q", " G")) {
+                current += 3;
+            } else if (substring_equals(str, len, current + 1, 1, 3, "C", "K", "Q")
+                   && !substring_equals(str, len, current + 1, 2, 2, "CE", "CI"))
+            {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            continue;
+        } else if (substring_equals(str, len, current, 2, 1, "Đ")) {
+            char_array_append(primary, "T");
+            char_array_append(secondary, "T");
+            current += 2;
+            continue;
+        } else if (c == 'D') {
+            if (substring_equals(str, len, current, 2, 1, "DG")) {
+                if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "Y")) {
+                    // e.g. "edge"
+                    char_array_append(primary, "J");
+                    char_array_append(secondary, "J");
+                    current += 3;
+                    continue;
+                } else {
+                    char_array_append(primary, "TK");
+                    char_array_append(secondary, "TK");
+                    current += 2;
+                    continue;
+                }
+            }
+
+            if (substring_equals(str, len, current, 2, 2, "DT", "DD")) {
+                char_array_append(primary, "T");
+                char_array_append(secondary, "T");
+                current += 2;
+                continue;
+            }
+
+            // else
+            char_array_append(primary, "T");
+            char_array_append(secondary, "T");
+            current++;
+            continue;
+        } else if (c == 'F') {
+            if (get_char_at(str, len, current + 1) == 'F') {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            char_array_append(primary, "F");
+            char_array_append(secondary, "F");
+            continue;
+        } else if (c == 'G') {
+            if (get_char_at(str, len, current + 1) == 'H') {
+                if ((current > 0) && !is_vowel(get_char_at(str, len, current - 1))) {
+                    char_array_append(primary, "K");
+                    char_array_append(secondary, "K");
+                    current += 2;
+                    continue;
+                }
+
+                if (current < 3) {
+                    // "ghislane", "ghiradelli"
+                    if (get_char_at(str, len, current + 2) == 'I') {
+                        char_array_append(primary, "J");
+                        char_array_append(secondary, "J");
+                    } else {
+                        char_array_append(primary, "K");
+                        char_array_append(secondary, "K");
+                    }
+                    current += 2;
+                    continue;
+                }
+
+                // Parker's rule (with some further refinements) - e.g. "hugh"
+                if (
+                    ((current > 1)
+                     && substring_equals(str, len, current - 2, 1, 3, "B", "H", "D"))
+                    // e.g. "bough"
+                    || ((current > 2)
+                        && substring_equals(str, len, current - 3, 1, 3, "B", "H", "D"))
+                    // e.g. "broughton"
+                    || ((current > 3)
+                        && substring_equals(str, len, current - 4, 1, 2, "B", "H"))
+                    )
+                {
+                    current += 2;
+                    continue;
+                } else {
+                    // e.g. "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
+                    if ((current > 2)
+                        && (get_char_at(str, len, current - 1) == 'U')
+                        && substring_equals(str, len, current - 3, 1, 5, "C", "G", "L", "R", "T"))
+                    {
+                        char_array_append(primary, "F");
+                        char_array_append(secondary, "F");
+                    } else if ((current > 0)
+                               && get_char_at(str, len, current - 1) == 'I')
+                    {
+                        char_array_append(primary, "K");
+                        char_array_append(secondary, "K");
+                    }
+                    current += 2;
+                    continue;
+                }
+
+            }
+
+            if (get_char_at(str, len, current + 1) == 'N') {
+                if ((current == 1) && is_vowel(get_char_at(str, len, 0))
+                    && slavo_germanic)
+                {
+                    char_array_append(primary, "KN");
+                    char_array_append(secondary, "N");
+                // not e.g. "cagney"
+                } else if (!substring_equals(str, len, current + 2, 2, 1, "EY")
+                        && (get_char_at(str, len, current + 1) != 'Y')
+                        && !slavo_germanic)
+                {
+                    char_array_append(primary, "N");
+                    char_array_append(secondary, "KN");
+                } else {
+                    char_array_append(primary, "KN");
+                    char_array_append(secondary, "KN");
+                }
+                current += 2;
+                continue;
+            }
+
+            // "tagliaro"
+            if (substring_equals(str, len, current + 1, 2, 1, "LI")
+                && !slavo_germanic)
+            {
+                char_array_append(primary, "KL");
+                char_array_append(secondary, "L");
+                current += 2;
+                continue;
+            }
+
+            // -ges-, -gep-, -gel-, -gie- at beginning
+            if ((current == 0)
+                && ((get_char_at(str, len, current + 1) == 'Y')
+                    || substring_equals(str, len, current + 1, 2, 13, "ES", "EP",
+                                        "EB", "EL", "EY", "IB", "IL", "IN", "IE",
+                                        "EI", "ER")))
+            {
+                char_array_append(primary, "K");
+                char_array_append(secondary, "J");
+                current += 2;
+                continue;
+            }
+
+            // -ger-, -gy-
+            if (
+                (substring_equals(str, len, current + 1, 2, 1, "ER")
+                 || (get_char_at(str, len, current + 1) == 'Y'))
+                && !substring_equals(str, len, 0, 6, 3, "DANGER", "RANGER", "MANGER")
+                && !substring_equals(str, len, current - 1, 1, 2, "E", "I")
+                && !substring_equals(str, len, current - 1, 3, 2, "RGY", "OGY")
+                )
+            {
+                char_array_append(primary, "K");
+                char_array_append(secondary, "J");
+                current += 2;
+                continue;
+            }
+
+            // italian e.g. "viaggi"
+            if (substring_equals(str, len, current + 1, 1, 3, "E", "I", "Y")
+                || substring_equals(str, len, current - 1, 4, 2, "AGGI", "OGGI"))
+            {
+                // obvious germanic
+                if (
+                    (substring_equals(str, len, 0, 4, 2, "VAN ", "VON ")
+                     || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ")
+                     || substring_equals(str, len, 0, 3, 1, "SCH"))
+                    || substring_equals(str, len, current + 1, 2, 1, "ET"))
+                {
+                    char_array_append(primary, "K");
+                    char_array_append(secondary, "K");
+                } else {
+                    if (substring_equals(str, len, current + 1, 4, 1, "IER ")
+                        || ((current == len - 3) && substring_equals(str, len, current + 1, 3, 1, "IER"))) 
+                    {
+                        char_array_append(primary, "J");
+                        char_array_append(secondary, "J");
+                    } else {
+                        char_array_append(primary, "J");
+                        char_array_append(secondary, "K");
+                    }
+                    current += 2;
+                    continue;
+                }
+            }
+
+            if (get_char_at(str, len, current + 1) == 'G') {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            char_array_append(primary, "K");
+            char_array_append(secondary, "K");
+            continue;
+        } else if (c == 'H') {
+            // only keep if first & before vowel or between 2 vowels
+            if (((current == 0) || is_vowel(get_char_at(str, len, current - 1)))
+                && is_vowel(get_char_at(str, len, current + 1)))
+            {
+                char_array_append(primary, "H");
+                char_array_append(secondary, "H");
+                current += 2;
+            // also takes care of "HH"
+            } else {
+                current++;
+            }
+            continue;
+        } else if (c == 'J') {
+            // obvious Spanish, "Jose", "San Jacinto"
+            if (substring_equals(str, len, current, 4, 1, "JOSE")
+                || substring_equals(str, len, current, 5, 1, "JOSÉ")
+                || substring_equals(str, len, 0, 4, 1, "SAN "))
+            {
+                if (((current == 0)
+                     && (get_char_at(str, len, current + 4) == ' '))
+                    || substring_equals(str, len, 0, 4, 1, "SAN "))
+                {
+                    char_array_append(primary, "H");
+                    char_array_append(secondary, "H");
+                } else {
+                    char_array_append(primary, "J");
+                    char_array_append(secondary, "H");                    
+                }
+
+                current++;
+                continue;
+            }
+
+            if ((current == 0)
+                && !substring_equals(str, len, current, 4, 1, "JOSE")
+                && !substring_equals(str, len, current, 5, 1, "JOSÉ"))
+            {
+                // Yankelovich/Jankelowicz
+                char_array_append(primary, "J");
+                char_array_append(secondary, "A");
+                current++;
+                continue;
+            } else {
+                // Spanish pronoun of e.g. "bajador"
+                if (is_vowel(get_char_at(str, len, current - 1))
+                    && !slavo_germanic
+                    && ((get_char_at(str, len, current + 1) == 'A')
+                        || (get_char_at(str, len, current + 1) == 'O')))
+                {
+                    char_array_append(primary, "J");
+                    char_array_append(secondary, "H");
+                } else {
+                    if (current == last) {
+                        char_array_append(primary, "J");
+                    } else {
+                        if (!substring_equals(str, len, current + 1, 1, 8, "L", "T",
+                                              "K", "S", "N", "M", "B", "Z")
+                            && !substring_equals(str, len, current - 1, 1, 3, "S", "K", "L"))
+                        {
+                            char_array_append(primary, "J");
+                            char_array_append(secondary, "J");
+                        }
+                    }
+                }
+
+
+                // it could happen!
+                if (get_char_at(str, len, current + 1) == 'J') {
+                    current += 2;
+                } else {
+                    current++;
+                }
+                continue;
+            }
+        } else if (c == 'K') {
+            if (get_char_at(str, len, current + 1) == 'K') {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            char_array_append(primary, "K");
+            char_array_append(secondary, "K");
+            continue;
+        } else if (substring_equals(str, len, current, 2, 1, "Ł")) {
+            current += 2;
+            char_array_append(primary, "L");
+            char_array_append(secondary, "L");
+            continue;
+        } else if (c == 'L') {
+            if (get_char_at(str, len, current + 1) == 'L') {
+                // Spanish e.g. "Cabrillo", "Gallegos"
+                if (((current == (len - 3))
+                     && substring_equals(str, len, current - 1, 4, 3, "ILLO", "ILLA", "ALLE"))
+                    || ((substring_equals(str, len, last - 1, 2, 2, "AS", "OS")
+                      || substring_equals(str, len, last, 1, 2, "A", "O"))
+                      && substring_equals(str, len, current - 1, 4, 1, "ALLE")
+                      )
+                    )
+                {
+                    char_array_append(primary, "L");
+                    current += 2;
+                    continue;
+                }
+
+                current += 2;
+            } else {
+                current++;
+            }
+            char_array_append(primary, "L");
+            char_array_append(secondary, "L");
+            continue;
+        } else if (c == 'M') {
+            if ((substring_equals(str, len, current - 1, 3, 1, "UMB")
+                && (((current + 1) == last)
+                    || substring_equals(str, len, current + 2, 2, 1, "ER")))
+                || (get_char_at(str, len, current + 1) == 'M'))
+            {
+                current += 2;
+            } else {
+                current++;
+            }
+            char_array_append(primary, "M");
+            char_array_append(secondary, "M");
+            continue;
+        } else if (c == 'N') {
+            if (get_char_at(str, len, current + 1) == 'N')  {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            char_array_append(primary, "N");
+            char_array_append(secondary, "N");
+            continue;
+        } else if (substring_equals(str, len, current, 2, 1, "Ñ")) {
+            current += 2;
+            char_array_append(primary, "N");
+            char_array_append(secondary, "N");
+            continue;
+        } else if (c == 'P') {
+            if (get_char_at(str, len, current + 1) == 'H') {
+                char_array_append(primary, "F");
+                char_array_append(secondary, "F");
+                current += 2;
+                continue;
+            }
+
+            // also account for "Campbell", "raspberry"
+            if (substring_equals(str, len, current + 1, 1, 2, "P", "B")) {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            char_array_append(primary, "P");
+            char_array_append(secondary, "P");
+            continue;
+        } else if (c == 'Q') {
+            if (get_char_at(str, len, current + 1) == 'Q') {
+                current += 2;
+            } else {
+                current += 1;
+            }
+
+            char_array_append(primary, "K");
+            char_array_append(secondary, "K");
+            continue;
+        } else if (c == 'R') {
+            // french e.g. "rogier", but exclude "hochmeier"
+            if ((current == last)
+                && !slavo_germanic
+                && substring_equals(str, len, current - 2, 2, 1, "IE")
+                && !substring_equals(str, len, current - 4, 2, 2, "ME", "MA"))
+            {
+                char_array_append(secondary, "R");
+            } else {
+                char_array_append(primary, "R");
+                char_array_append(secondary, "R");
+            }
+
+            if (get_char_at(str, len, current + 1) == 'R') {
+                current += 2;
+            } else {
+                current++;
+            }
+            continue;
+        } else if (c == 'S') {
+            // special cases "island", "isle", "carlisle", "carlysle"
+            if (substring_equals(str, len, current - 1, 3, 2, "ISL", "YSL")) {
+                current++;
+                continue;
+            }
+
+            // special case "sugar-"
+            if ((current == 0)
+                && substring_equals(str, len, current, 5, 1, "SUGAR"))
+            {
+                char_array_append(primary, "X");
+                char_array_append(secondary, "S");
+                current++;
+                continue;
+            }
+
+            if (substring_equals(str, len, current, 2, 1, "SH")) {
+                // Germanic
+                if (substring_equals(str, len, current + 1, 4, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) {
+                    char_array_append(primary, "S");
+                    char_array_append(secondary, "S");
+                } else {
+                    char_array_append(primary, "X");
+                    char_array_append(secondary, "X");
+                }
+                current += 2;
+                continue;
+            }
+
+            // Italian & Armenian
+            if (substring_equals(str, len, current, 3, 2, "SIO", "SIA")
+                || substring_equals(str, len, current, 4, 1, "SIAN"))
+            {
+                if (!slavo_germanic) {
+                    char_array_append(primary, "S");
+                    char_array_append(secondary, "X");
+                } else {
+                    char_array_append(primary, "S");
+                    char_array_append(secondary, "S");
+                }
+                current += 3;
+                continue;
+            }
+
+            /* German & Anglicisations, e.g. "Smith" match "Schmidt", "Snider" match "Schneider"
+               also, -sz- in Slavic language although in Hungarian it is pronounced 's' */
+            if (((current == 0)
+                 && substring_equals(str, len, current + 1, 1, 4, "M", "N", "L", "W"))
+                || substring_equals(str, len, current + 1, 1, 1, "Z"))
+            {
+                char_array_append(primary, "S");
+                char_array_append(secondary, "X");
+                if (substring_equals(str, len, current + 1, 1, 1, "Z")) {
+                    current += 2;
+                } else {
+                    current++;
+                }
+                continue;
+            }
+
+
+            if (substring_equals(str, len, current, 2, 1, "SC")) {
+                // Schlesinger's rule
+                if (get_char_at(str, len, current + 2) == 'H') {
+                    // Dutch origin e.g. "school", "schooner"
+                    if (substring_equals(str, len, current + 3, 2, 6, "OO", "ER", "EN",
+                                         "UY", "ED", "EM"))
+                    {
+                        // "Schermerhorn", "Schenker"
+                        if (substring_equals(str, len, current + 3, 2, 2, "ER", "EN")) {
+                            char_array_append(primary, "X");
+                            char_array_append(secondary, "SK");
+                        } else {
+                            char_array_append(primary, "SK");
+                            char_array_append(secondary, "SK");
+                        }
+                        current += 3;
+                        continue;
+                    } else {
+                        if ((current == 0) && !is_vowel(get_char_at(str, len, 3))
+                            && (get_char_at(str, len, 3) != 'W'))
+                        {
+                            char_array_append(primary, "X");
+                            char_array_append(secondary, "S");
+                        } else {
+                            char_array_append(primary, "X");
+                            char_array_append(secondary, "X");
+                        }
+                        current += 3;
+                        continue;
+                    }
+
+                    if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "Y")) {
+                        char_array_append(primary, "S");
+                        char_array_append(secondary, "S");
+                        current += 3;
+                        continue;
+                    }
+
+                    char_array_append(primary, "SK");
+                    char_array_append(secondary, "SK");
+                    current += 3;
+                    continue;
+                }
+            }
+
+            // French e.g. "resnais", "artois"
+            if ((current == last)
+                && substring_equals(str, len, current - 2, 2, 2, "AI", "OI"))
+            {
+                char_array_append(secondary, "S");
+            } else {
+                char_array_append(primary, "S");
+                char_array_append(secondary, "S");
+            }
+
+            if (substring_equals(str, len, current - 1, 1, 2, "S", "Z")) {
+                current += 2;
+            } else {
+                current++;
+            }
+            continue;
+        } else if (c == 'T') {
+
+            if (substring_equals(str, len, current, 4, 1, "TION")) {
+                char_array_append(primary, "X");
+                char_array_append(secondary, "X");
+                current += 3;
+                continue;
+            }
+
+            if (substring_equals(str, len, current, 3, 2, "TIA", "TCH")) {
+                char_array_append(primary, "X");
+                char_array_append(secondary, "X");
+                current += 3;
+                continue;
+            }
+
+            if (substring_equals(str, len, current, 2, 1, "TH")
+                || substring_equals(str, len, current, 3, 1, "TTH")) 
+            {
+                // special case "Thomas", "Thames", or Germanic
+                if (substring_equals(str, len, current + 2, 2, 2, "OM", "AM")
+                 || substring_equals(str, len, 0, 4, 2, "VAN ", "VON ")
+                 || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ")
+                 || substring_equals(str, len, 0, 3, 1, "SCH"))
+                {
+                    char_array_append(primary, "T");
+                    char_array_append(secondary, "T");
+                } else {
+                    // yes, zero
+                    char_array_append(primary, "0");
+                    char_array_append(secondary, "T");
+                }
+
+                current += 2;
+                continue;
+            }
+
+            if (substring_equals(str, len, current + 1, 1, 2, "T", "D")) {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            char_array_append(primary, "T");
+            char_array_append(secondary, "T");
+            continue;
+        } else if (c == 'V') {
+            if (get_char_at(str, len, current + 1) == 'V') {
+                current += 2;
+            } else {
+                current++;
+            }
+
+            char_array_append(primary, "F");
+            char_array_append(secondary, "F");
+            continue;
+        } else if (c == 'W') {
+            // can also be in the middle of word
+            if (substring_equals(str, len, current, 2, 1, "WR")) {
+                char_array_append(primary, "R");
+                char_array_append(secondary, "R");
+                current += 2;
+                continue;
+            }
+
+            if ((current == 0)
+                && (is_vowel(get_char_at(str, len, current + 1))
+                || substring_equals(str, len, current, 2, 1, "WH")))
+            {
+                // Wasserman should match Vasserman
+                if (is_vowel(get_char_at(str, len, current + 1))) {
+                    char_array_append(primary, "A");
+                    char_array_append(secondary, "F");
+                } else {
+                    // need Uomo to match Womo
+                    char_array_append(primary, "A");
+                    char_array_append(secondary, "A");
+                }
+            }
+
+            // Arnow should match Arnoff
+            if (((current == last) && is_vowel(get_char_at(str, len, current - 1)))
+                || substring_equals(str, len, current - 1, 5, 4, "EWSKI", "EWSKY",
+                                    "OWSKI", "OWSKY")
+                || substring_equals(str, len, 0, 3, 1, "SCH"))
+            {
+                char_array_append(secondary, "F");
+                current++;
+                continue;
+            }
+
+            // Polish e.g. "Filipowicz"
+            if (substring_equals(str, len, current, 4, 2, "WICZ", "WITZ")) {
+                char_array_append(primary, "TS");
+                char_array_append(secondary, "FX");
+                current += 4;
+                continue;
+            }
+
+            // else skip it
+            current++;
+            continue;
+        } else if (c == 'X') {
+            // French e.g. "breaux"
+            if (!((current == last)
+                  && (substring_equals(str, len, current - 3, 3, 2, "IAU", "EAU")
+                   || substring_equals(str, len, current - 2, 2, 2, "AU", "OU"))))
+            {
+                char_array_append(primary, "KS");
+                char_array_append(secondary, "KS");
+            }
+
+            if (substring_equals(str, len, current + 1, 1, 2, "C", "X")) {
+                current += 2;
+            } else {
+                current++;
+            }
+            continue;
+        } else if (c == 'Z') {
+            // Chinese Pinyin e.g. "Zhao"
+            if (get_char_at(str, len, current + 1) == 'H') {
+                char_array_append(primary, "J");
+                char_array_append(secondary, "J");
+                current += 2;
+                continue;
+            } else if (substring_equals(str, len, current + 1, 2, 3, "ZO", "ZI", "ZA")
+                   || (slavo_germanic
+                       && ((current > 0)
+                       && get_char_at(str, len, current - 1) != 'T')))
+            {
+                char_array_append(primary, "S");
+                char_array_append(secondary, "TS");
+            } else {
+                char_array_append(primary, "S");
+                char_array_append(secondary, "S");
+            }
+
+            if (get_char_at(str, len, current + 1) == 'Z') {
+                current += 2;
+            } else {
+                current++;
+            }
+            continue;
+        } else {
+            current++;
+        }
+    }
+
+    double_metaphone_codes_t *codes = calloc(1, sizeof(double_metaphone_codes_t));
+    if (codes == NULL) {
+        char_array_destroy(primary);
+        char_array_destroy(secondary);
+        return NULL;
+    }
+
+    codes->primary = char_array_to_string(primary);
+    codes->secondary = char_array_to_string(secondary);
+
+    free(ptr);
+
+    return codes;
+}
+
+void double_metaphone_codes_destroy(double_metaphone_codes_t *codes) {
+    if (codes != NULL) {
+        if (codes->primary != NULL) {
+            free(codes->primary);
+        }
+
+        if (codes->secondary != NULL) {
+            free(codes->secondary);
+        }
+
+        free(codes);
+    }
+}
--- a/src/double_metaphone.h
+++ b/src/double_metaphone.h
@@ -0,0 +1,17 @@
+#ifndef DOUBLE_METAPHONE__H
+#define DOUBLE_METAPHONE__H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef struct double_metaphone_codes {
+    char *primary;
+    char *secondary;
+} double_metaphone_codes_t;
+
+double_metaphone_codes_t *double_metaphone(char *input);
+
+void double_metaphone_codes_destroy(double_metaphone_codes_t *codes);
+
+#endif
+