libpostal/src/double_metaphone.c

#include <stdarg.h>
#include <string.h>
#include <stdbool.h>

#include "double_metaphone.h"
#include "string_utils.h"
#include "utf8proc/utf8proc.h"

static bool is_vowel(char c) {
    return (c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U' || c == 'Y');
}

static char get_char_at(char *str, size_t len, ssize_t idx) {
    if (idx < 0 || idx >= len) return 0;
    return str[idx];
}

static char *get_string_at(char *str, size_t len, ssize_t idx) {
    if (idx < 0 || idx >= len) return NULL;
    return str + idx;
}

static inline bool is_slavo_germanic(char *s) {
    return strstr(s, "W")
        || strstr(s, "K")
        || strstr(s, "CZ")
        || strstr(s, "WITZ");
}

static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t substr_len, ...) {
    char *string_at_index = get_string_at(str, len, index);
    if (string_at_index == NULL) return false;

    va_list args;
    va_start(args, substr_len);

    bool matched = false;

    while (true) {
        char *sub = va_arg(args, char *);
        if (sub == NULL) break;

        if (utf8_compare_len(string_at_index, sub, substr_len) == 0) {
            matched = true;
            break;
        }
    }

    va_end(args);

    return matched;

}

double_metaphone_codes_t *double_metaphone(char *input) {
    if (input == NULL) return NULL;

    char *ptr = utf8_upper(input);

    /* Note: NFD normalization will help with simple decomposable accent characters
       like "É", "Ü", etc. which effectively become "E\u0301" and "U\u0308". It does
       not handle characters like "Ł". For these, use Latin-ASCII transliteration
       prior to calling this function.

       We can still check for a specific accented character like C with cedilla (Ç),
       by comparing with its decomposed form i.e. "C\xcc\xa7"
    */

    char *normalized = (char *)utf8proc_NFD((utf8proc_uint8_t *)ptr);

    if (normalized != NULL) {
        free(ptr);
        ptr = normalized;
    }

    if (ptr == NULL) {
        return NULL;
    }

    char *str = ptr;

    size_t len = strlen(str);
    char_array *primary = char_array_new_size(len + 1);
    char_array *secondary = char_array_new_size(len + 1);

    bool slavo_germanic = is_slavo_germanic(str);

    size_t current = 0;
    size_t last = len - 1;

    if (substring_equals(str, len, current, 2, "ʻ", NULL)) {
        str += 2;
    } else if (get_char_at(str, len, current) == '\'') {
        str++;
    }

    if (substring_equals(str, len, current, 2, "GN", "KN", "PN", "WR", "PS", NULL)) {
        current++;
    } else if (get_char_at(str, len, current) == 'X') {
        char_array_append(primary, "S");
        char_array_append(secondary, "S");
        current++;
    }

    while (true) {
        char c = *(str + current);
        if (c == '\x00') break;

        if (current == 0 && is_vowel(c)) {
            char_array_append(primary, "A");
            char_array_append(secondary, "A");
            current++;
            continue;
        } else if (c == 'B') {
            /* "-mb", e.g", "dumb", already skipped over... */
            char_array_append(primary, "P");
            char_array_append(secondary, "P");

            if (get_char_at(str, len, current + 1) == 'B') {
                current += 2;
            } else {
                current++;
            }
            continue;
        // Ç - C with cedilla (denormalized)
        } else if (substring_equals(str, len, current, 3, "C\xcc\xa7", NULL)) {
            char_array_append(primary, "S");
            char_array_append(secondary, "S");
            current += 2;
        } else if (c == 'C') {
            // various germanic
            if ((current > 1)
                && !is_vowel(get_char_at(str, len, current - 2))
                && (substring_equals(str, len, current - 1, 3, "ACH", NULL)
                    && !substring_equals(str, len, current + 2, 1, "O", "A", "U", NULL))
                && ((get_char_at(str, len, current + 2) != 'I')
                    && ((get_char_at(str, len, current + 2) != 'E')
                        || substring_equals(str, len, current - 2, 6, "BACHER", "MACHER", NULL))
                   )
                )
            {
                char_array_append(primary, "K");
                char_array_append(secondary, "K");
                current += 2;
                continue;
            }

            // special case for "caesar"
            if ((current == 0)
                && substring_equals(str, len, current, 6, "CAESAR", NULL))
            {
                char_array_append(primary, "S");
                char_array_append(secondary, "K");
                current += 2;
                continue;
            }

            // Italian e.g. "chianti"
            if (substring_equals(str, len, current, 4, "CHIA", NULL)) {
                char_array_append(primary, "K");
                char_array_append(secondary, "K");
                current += 2;
                continue;
            }

            if (substring_equals(str, len, current, 2, "CH", NULL)) {
                // "michael"
                if ((current > 0)
                    && substring_equals(str, len, current, 4, "CHAE", NULL))
                {
                    char_array_append(primary, "K");
                    char_array_append(secondary, "X");
                    current += 2;
                    continue;
                }

                // Greek roots e.g. "chemistry", "chorus"
                if ((current == 0)
                    && (substring_equals(str, len, current + 1, 5, "HARAC", "HARIS", "HOREO", NULL)
                     || substring_equals(str, len, current + 1, 4, "HIRO", "HAOS", "HAOT", NULL)
                     || (substring_equals(str, len, current + 1, 3, "HOR", "HYM", "HIA", "HEM", "HIM", NULL) && !substring_equals(str, len, current + 1, 5, "HEMIN", NULL)))
                    )
                {
                    char_array_append(primary, "K");
                    char_array_append(secondary, "K");
                    current += 2;
                    continue;
                }

                // Germanic, Greek, or otherwise "ch" for "kh" sound
                if (
                      (substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL)
                       || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL)
                       || substring_equals(str, len, 0, 3, "SCH", NULL))
                      // "ochestra", "orchid", "architect" but not "arch"
                      || substring_equals(str, len, current - 2, 6, "ORCHES", "ARCHIT", "ORCHID", NULL)
                      || substring_equals(str, len, current + 2, 1, "T", "S", NULL)
                      || (
                            (((current == 0) || substring_equals(str, len, current - 1, 1, "A", "O", "U", "E", NULL))
                             // e.g. not "breach", "broach", "pouch", "beech", etc.
                             && !substring_equals(str, len, current - 2, 2, "EA", "OU", "EE", "OA", "OO", "AU", NULL)
                             // e.g. not "lunch", "birch", "gulch"
                             && !substring_equals(str, len, current - 1, 1, "L", "R", "N", NULL))
                            // e.g. "wachtler", "wechsler", but not "tichner"
                            && ((current + 1 == last) || substring_equals(str, len, current + 2, 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", NULL))
                         )
                   )
                {
                    char_array_append(primary, "K");
                    char_array_append(secondary, "K");
                } else {
                    if (current > 0) {
                        if (substring_equals(str, len, 0, 2, "MC", NULL)) {
                            char_array_append(primary, "K");
                            char_array_append(secondary, "K");
                        } else {
                            char_array_append(primary, "X");
                            char_array_append(secondary, "K");
                        }
                    } else {
                        char_array_append(primary, "X");
                        char_array_append(secondary, "X");
                    }
                }
                current += 2;
                continue;
            }

            // e.g, "czerny"
            if (substring_equals(str, len, current, 2, "CZ", NULL)
                && !substring_equals(str, len, current - 2, 4, "WICZ", NULL))
            {
                char_array_append(primary, "S");
                char_array_append(secondary, "X");
                current += 2;
                continue;
            }

            // double 'C' but not if e.g. "McClellan"
            if (substring_equals(str, len, current, 2, "CC", NULL)
                && !((current == 1) && get_char_at(str, len, 0) == 'M'))
            {
                // "bellocchio" but not "bacchus"
                if (substring_equals(str, len, current + 2, 1, "I", "E", "H", NULL)
                    && !substring_equals(str, len, current + 2, 3, "HUS", "HUM", "HUN", "HAN", NULL))
                {
                    // "accident", "accede", "succeed"
                    if (((current == 1)
                         && (get_char_at(str, len, current - 1) == 'A'))
                        || substring_equals(str, len, current - 1, 5, "UCCEE", "UCCES", NULL))
                    {
                        char_array_append(primary, "KS");
                        char_array_append(secondary, "KS");
                    // "pinocchio" but not "riccio" or "picchu"
                    } else if (get_char_at(str, len, current + 2) == 'H'
                           && !substring_equals(str, len, current + 2, 2, "HU", "HA", NULL)) {
                        char_array_append(primary, "K");
                        char_array_append(secondary, "X");
                    } else {
                        char_array_append(primary, "X");
                        char_array_append(secondary, "X");
                    }
                    current += 3;
                    continue;
                } else {
                    // Pierce's rule
                    char_array_append(primary, "K");
                    char_array_append(secondary, "K");
                    current += 2;
                    continue;
                }
            }

            if (substring_equals(str, len, current, 2, "CK", "CG", "CQ", NULL)) {
                char_array_append(primary, "K");
                char_array_append(secondary, "K");
                current += 2;
                continue;
            }

            if (substring_equals(str, len, current, 2, "CI", "CJ", "CE", "CY", NULL)) {
                if (substring_equals(str, len, current, 3, "CIO", "CIE", "CIA", "CIU", NULL)) {
                    char_array_append(primary, "S");
                    char_array_append(secondary, "X");
                } else {
                    char_array_append(primary, "S");
                    char_array_append(secondary, "S");
                }
                current += 2;
                continue;
            }

            // else
            char_array_append(primary, "K");
            char_array_append(secondary, "K");

            if (substring_equals(str, len, current + 1, 2, " C", " Q", " G", NULL)) {
                current += 3;
            } else if (substring_equals(str, len, current + 1, 1, "C", "K", "Q", NULL)
                   && !substring_equals(str, len, current + 1, 2, "CE", "CI", NULL))
            {
                current += 2;
            } else {
                current++;
            }

            continue;
        } else if (c == 'D') {
            if (substring_equals(str, len, current, 2, "DG", NULL)) {
                if (substring_equals(str, len, current + 2, 1, "I", "E", "Y", NULL)) {
                    // e.g. "edge"
                    char_array_append(primary, "J");
                    char_array_append(secondary, "J");
                    current += 3;
                    continue;
                } else {
                    char_array_append(primary, "TK");
                    char_array_append(secondary, "TK");
                    current += 2;
                    continue;
                }
            }

            if (substring_equals(str, len, current, 2, "DT", "DD", NULL)) {
                char_array_append(primary, "T");
                char_array_append(secondary, "T");
                current += 2;
                continue;
            }

            // else
            char_array_append(primary, "T");
            char_array_append(secondary, "T");
            current++;
            continue;
        } else if (c == 'F') {
            if (get_char_at(str, len, current + 1) == 'F') {
                current += 2;
            } else {
                current++;
            }

            char_array_append(primary, "F");
            char_array_append(secondary, "F");
            continue;
        } else if (c == 'G') {
            if (get_char_at(str, len, current + 1) == 'H') {
                if ((current > 0) && !is_vowel(get_char_at(str, len, current - 1))) {
                    char_array_append(primary, "K");
                    char_array_append(secondary, "K");
                    current += 2;
                    continue;
                }

                if (current == 0) {
                    // "ghislane", "ghiradelli"
                    if (get_char_at(str, len, current + 2) == 'I') {
                        char_array_append(primary, "J");
                        char_array_append(secondary, "J");
                    } else {
                        char_array_append(primary, "K");
                        char_array_append(secondary, "K");
                    }
                    current += 2;
                    continue;
                }

                // Parker's rule (with some further refinements) - e.g. "hugh"
                if (
                    ((current > 1)
                     && substring_equals(str, len, current - 2, 1, "B", "H", "D", NULL))
                    // e.g. "bough"
                    || ((current > 2)
                        && substring_equals(str, len, current - 3, 1, "B", "H", "D", NULL))
                    // e.g. "broughton"
                    || ((current > 3)
                        && substring_equals(str, len, current - 4, 1, "B", "H", NULL))
                    )
                {
                    current += 2;
                    continue;
                } else {
                    // e.g. "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
                    if ((current > 2)
                        && (get_char_at(str, len, current - 1) == 'U')
                        && substring_equals(str, len, current - 3, 1, "C", "G", "L", "R", "T", NULL))
                    {
                        char_array_append(primary, "F");
                        char_array_append(secondary, "F");
                    } else if ((current > 0)
                               && get_char_at(str, len, current - 1) != 'I')
                    {
                        char_array_append(primary, "K");
                        char_array_append(secondary, "K");
                    }
                    current += 2;
                    continue;
                }

            }

            if (get_char_at(str, len, current + 1) == 'N') {
                if ((current == 1) && is_vowel(get_char_at(str, len, 0))
                    && !slavo_germanic)
                {
                    char_array_append(primary, "KN");
                    char_array_append(secondary, "N");
                // not e.g. "cagney"
                } else if (!substring_equals(str, len, current + 2, 2, "EY", NULL)
                        && (get_char_at(str, len, current + 1) != 'Y')
                        && !slavo_germanic)
                {
                    char_array_append(primary, "N");
                    char_array_append(secondary, "KN");
                } else {
                    char_array_append(primary, "KN");
                    char_array_append(secondary, "KN");
                }
                current += 2;
                continue;
            }

            // "tagliaro"
            if (substring_equals(str, len, current + 1, 2, "LI", NULL)
                && !slavo_germanic)
            {
                char_array_append(primary, "KL");
                char_array_append(secondary, "L");
                current += 2;
                continue;
            }

            // -ges-, -gep-, -gel-, -gie- at beginning
            if ((current == 0)
                && ((get_char_at(str, len, current + 1) == 'Y')
                    || substring_equals(str, len, current + 1, 2, "ES", "EP",
                                        "EB", "EL", "EY", "IB", "IL", "IN", "IE",
                                        "EI", "ER", NULL)))
            {
                char_array_append(primary, "K");
                char_array_append(secondary, "J");
                current += 2;
                continue;
            }

            // -ger-, -gy-
            if (
                (substring_equals(str, len, current + 1, 2, "ER", NULL)
                 || (get_char_at(str, len, current + 1) == 'Y'))
                && !substring_equals(str, len, 0, 6, "DANGER", "RANGER", "MANGER", NULL)
                && !substring_equals(str, len, current - 1, 1, "E", "I", NULL)
                && !substring_equals(str, len, current - 1, 3, "RGY", "OGY", NULL)
                )
            {
                char_array_append(primary, "K");
                char_array_append(secondary, "J");
                current += 2;
                continue;
            }

            // italian e.g. "viaggi"
            if (substring_equals(str, len, current + 1, 1, "E", "I", "Y", NULL)
                || substring_equals(str, len, current - 1, 4, "AGGI", "OGGI", NULL))
            {
                // obvious germanic
                if (
                    (substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL)
                     || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL)
                     || substring_equals(str, len, 0, 3, "SCH", NULL))
                    || substring_equals(str, len, current + 1, 2, "ET", NULL))
                {
                    char_array_append(primary, "K");
                    char_array_append(secondary, "K");

                } else {
                    if (substring_equals(str, len, current + 1, 4, "IER ", NULL)
                        || ((current == len - 3) && substring_equals(str, len, current + 1, 3, "IER", NULL)))
                    {
                        char_array_append(primary, "J");
                        char_array_append(secondary, "J");
                    } else {
                        char_array_append(primary, "J");
                        char_array_append(secondary, "K");
                    }
                }
                current += 2;
                continue;
            }

            if (get_char_at(str, len, current + 1) == 'G') {
                current += 2;
            } else {
                current++;
            }

            char_array_append(primary, "K");
            char_array_append(secondary, "K");
            continue;
        } else if (c == 'H') {
            // only keep if first & before vowel or between 2 vowels
            if (((current == 0) || is_vowel(get_char_at(str, len, current - 1)))
                && is_vowel(get_char_at(str, len, current + 1)))
            {
                char_array_append(primary, "H");
                char_array_append(secondary, "H");
                current += 2;
            // also takes care of "HH"
            } else {
                current++;
            }
            continue;
        } else if (c == 'J') {
            // obvious Spanish, "Jose", "San Jacinto"
            if (substring_equals(str, len, current, 4, "JOSE", NULL)
                || substring_equals(str, len, current, 5, "JOSÉ", NULL)
                || substring_equals(str, len, 0, 4, "SAN ", NULL))
            {
                if (((current == 0)
                     && (get_char_at(str, len, current + 4) == ' '))
                    || substring_equals(str, len, 0, 4, "SAN ", NULL))
                {
                    char_array_append(primary, "H");
                    char_array_append(secondary, "H");
                } else {
                    char_array_append(primary, "J");
                    char_array_append(secondary, "H");
                }

                current++;
                continue;
            }

            if ((current == 0)
                && !substring_equals(str, len, current, 4, "JOSE", NULL)
                && !substring_equals(str, len, current, 5, "JOSÉ", NULL))
            {
                // Yankelovich/Jankelowicz
                char_array_append(primary, "J");
                char_array_append(secondary, "A");
                current++;
                continue;
            } else {
                // Spanish pronoun of e.g. "bajador"
                if (is_vowel(get_char_at(str, len, current - 1))
                    && !slavo_germanic
                    && ((get_char_at(str, len, current + 1) == 'A')
                        || (get_char_at(str, len, current + 1) == 'O')))
                {
                    char_array_append(primary, "J");
                    char_array_append(secondary, "H");
                } else {
                    if (current == last || ((current == last - 1 || get_char_at(str, len, current + 2) == ' ') && isalpha(get_char_at(str, len, current - 1)) && substring_equals(str, len, current + 1, 1, "A", "O", NULL))) {
                        char_array_append(primary, "J");
                    } else {
                        if (!substring_equals(str, len, current + 1, 1, "L", "T",
                                              "K", "S", "N", "M", "B", "Z", NULL)
                            && !substring_equals(str, len, current - 1, 1, "S", "K", "L", NULL))
                        {
                            char_array_append(primary, "J");
                            char_array_append(secondary, "J");
                        }
                    }
                }

                // it could happen!
                if (get_char_at(str, len, current + 1) == 'J') {
                    current += 2;
                } else {
                    current++;
                }
                continue;
            }
        } else if (c == 'K') {
            if (get_char_at(str, len, current + 1) == 'K') {
                current += 2;
            } else {
                current++;
            }

            char_array_append(primary, "K");
            char_array_append(secondary, "K");
            continue;
        } else if (c == 'L') {
            if (get_char_at(str, len, current + 1) == 'L') {
                // Spanish e.g. "Cabrillo", "Gallegos"
                if (((current == (len - 3))
                     && substring_equals(str, len, current - 1, 4, "ILLO", "ILLA", "ALLE", NULL))
                    || ((substring_equals(str, len, last - 1, 2, "AS", "OS", NULL)
                      || substring_equals(str, len, last, 1, "A", "O", NULL))
                      && substring_equals(str, len, current - 1, 4, "ALLE", NULL)
                      )
                    )
                {
                    char_array_append(primary, "L");
                    current += 2;
                    continue;
                }

                current += 2;
            } else {
                current++;
            }
            char_array_append(primary, "L");
            char_array_append(secondary, "L");
            continue;
        } else if (c == 'M') {
            if ((substring_equals(str, len, current - 1, 3, "UMB", NULL)
                && (((current + 1) == last)
                    || substring_equals(str, len, current + 2, 2, "ER", NULL)))
                || (get_char_at(str, len, current + 1) == 'M'))
            {
                current += 2;
            } else {
                current++;
            }
            char_array_append(primary, "M");
            char_array_append(secondary, "M");
            continue;
        // Ñ (NFD normalized)
        } else if (substring_equals(str, len, current, 3, "N\xcc\x83", NULL)) {
            current += 3;
            char_array_append(primary, "N");
            char_array_append(secondary, "N");
            continue;
        } else if (c == 'N') {
            if (get_char_at(str, len, current + 1) == 'N')  {
                current += 2;
            } else {
                current++;
            }

            char_array_append(primary, "N");
            char_array_append(secondary, "N");
            continue;
        } else if (c == 'P') {
            if (substring_equals(str, len, current + 1, 1, "H", "F", NULL)) {
                char_array_append(primary, "F");
                char_array_append(secondary, "F");
                current += 2;
                continue;
            }

            // also account for "Campbell", "raspberry"
            if (substring_equals(str, len, current + 1, 1, "P", "B", NULL)) {
                current += 2;
            } else {
                current++;
            }

            char_array_append(primary, "P");
            char_array_append(secondary, "P");
            continue;
        } else if (c == 'Q') {
            if (get_char_at(str, len, current + 1) == 'Q') {
                current += 2;
            } else {
                current += 1;
            }

            char_array_append(primary, "K");
            char_array_append(secondary, "K");
            continue;
        } else if (c == 'R') {
            // french e.g. "rogier", but exclude "hochmeier"
            if ((current == last)
                && !slavo_germanic
                && substring_equals(str, len, current - 2, 2, "IE", NULL)
                && !substring_equals(str, len, current - 4, 2, "ME", "MA", NULL))
            {
                char_array_append(secondary, "R");
            } else {
                char_array_append(primary, "R");
                char_array_append(secondary, "R");
            }

            if (get_char_at(str, len, current + 1) == 'R') {
                current += 2;
            } else {
                current++;
            }
            continue;
        } else if (c == 'S') {
            // special cases "island", "isle", "carlisle", "carlysle"
            if (substring_equals(str, len, current - 1, 3, "ISL", "YSL", NULL)) {
                current++;
                continue;
            }

            // special case "sugar-"
            if ((current == 0)
                && substring_equals(str, len, current, 5, "SUGAR", NULL))
            {
                char_array_append(primary, "X");
                char_array_append(secondary, "S");
                current++;
                continue;
            }

            if (substring_equals(str, len, current, 2, "SH", NULL)) {
                // Germanic
                if (substring_equals(str, len, current + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ", NULL)) {
                    char_array_append(primary, "S");
                    char_array_append(secondary, "S");
                } else {
                    char_array_append(primary, "X");
                    char_array_append(secondary, "X");
                }
                current += 2;
                continue;
            }

            // Italian & Armenian
            if (substring_equals(str, len, current, 3, "SIO", "SIA", NULL)
                || substring_equals(str, len, current, 4, "SIAN", NULL))
            {
                if (!slavo_germanic) {
                    char_array_append(primary, "S");
                    char_array_append(secondary, "X");
                } else {
                    char_array_append(primary, "S");
                    char_array_append(secondary, "S");
                }
                current += 3;
                continue;
            }

            /* German & Anglicisations, e.g. "Smith" match "Schmidt", "Snider" match "Schneider"
               also, -sz- in Slavic language although in Hungarian it is pronounced 's' */
            if (((current == 0)
                 && substring_equals(str, len, current + 1, 1, "M", "N", "L", "W", NULL))
                || substring_equals(str, len, current + 1, 1, "Z", NULL))
            {
                char_array_append(primary, "S");
                char_array_append(secondary, "X");
                if (substring_equals(str, len, current + 1, 1, "Z", NULL)) {
                    current += 2;
                } else {
                    current++;
                }
                continue;
            }


            if (substring_equals(str, len, current, 2, "SC", NULL)) {
                // Schlesinger's rule
                if (get_char_at(str, len, current + 2) == 'H') {
                    // Dutch origin e.g. "school", "schooner"
                    if (substring_equals(str, len, current + 3, 2, "OO", "ER", "EN",
                                         "UY", "ED", "EM", NULL))
                    {
                        // "Schermerhorn", "Schenker"
                        if (substring_equals(str, len, current + 3, 2, "ER", "EN", NULL)) {
                            char_array_append(primary, "X");
                            char_array_append(secondary, "SK");
                        } else {
                            char_array_append(primary, "SK");
                            char_array_append(secondary, "SK");
                        }
                        current += 3;
                        continue;
                    } else {
                        if ((current == 0) && !is_vowel(get_char_at(str, len, 3))
                            && (get_char_at(str, len, 3) != 'W'))
                        {
                            char_array_append(primary, "X");
                            char_array_append(secondary, "S");
                        } else {
                            char_array_append(primary, "X");
                            char_array_append(secondary, "X");
                        }
                        current += 3;
                        continue;
                    }

                    if (substring_equals(str, len, current + 2, 1, "I", "E", "Y", NULL)) {
                        char_array_append(primary, "S");
                        char_array_append(secondary, "S");
                        current += 3;
                        continue;
                    }

                    char_array_append(primary, "SK");
                    char_array_append(secondary, "SK");
                    current += 3;
                    continue;
                }
            }

            // French e.g. "resnais", "artois"
            if ((current == last)
                && substring_equals(str, len, current - 2, 2, "AI", "OI", NULL))
            {
                char_array_append(secondary, "S");
            } else {
                char_array_append(primary, "S");
                char_array_append(secondary, "S");
            }

            if (substring_equals(str, len, current + 1, 1, "S", "Z", NULL)) {

                current += 2;
            } else {
                current++;
            }
            continue;
        } else if (c == 'T') {

            if (substring_equals(str, len, current, 4, "TION", NULL)) {
                char_array_append(primary, "X");
                char_array_append(secondary, "X");
                current += 3;
                continue;
            }

            if (substring_equals(str, len, current, 3, "TIA", "TCH", NULL)) {
                char_array_append(primary, "X");
                char_array_append(secondary, "X");
                current += 3;
                continue;
            }

            if (substring_equals(str, len, current, 2, "TH", NULL)
                || substring_equals(str, len, current, 3, "TTH", NULL))
            {
                // special case "Thomas", "Thames", or Germanic
                if (substring_equals(str, len, current + 2, 2, "OM", "AM", NULL)
                 || substring_equals(str, len, 0, 4, "VAN ", "VON ", NULL)
                 || substring_equals(str, len, current - 5, 5, " VAN ", " VON ", NULL)
                 || substring_equals(str, len, 0, 3, "SCH", NULL))
                {
                    char_array_append(primary, "T");
                    char_array_append(secondary, "T");
                } else {
                    // yes, zero
                    char_array_append(primary, "0");
                    char_array_append(secondary, "T");
                }

                current += 2;
                continue;
            }

            if (substring_equals(str, len, current + 1, 1, "T", "D", NULL)) {
                current += 2;
            } else {
                current++;
            }

            char_array_append(primary, "T");
            char_array_append(secondary, "T");
            continue;
        } else if (c == 'V') {
            if (get_char_at(str, len, current + 1) == 'V') {
                current += 2;
            } else {
                current++;
            }

            char_array_append(primary, "F");
            char_array_append(secondary, "F");
            continue;
        } else if (c == 'W') {
            // can also be in the middle of word
            if (substring_equals(str, len, current, 2, "WR", NULL)) {
                char_array_append(primary, "R");
                char_array_append(secondary, "R");
                current += 2;
                continue;
            }

            if ((current == 0)
                && (is_vowel(get_char_at(str, len, current + 1))
                || substring_equals(str, len, current, 2, "WH", NULL)))
            {
                // Wasserman should match Vasserman
                if (is_vowel(get_char_at(str, len, current + 1))) {
                    char_array_append(primary, "A");
                    char_array_append(secondary, "F");
                } else {
                    // need Uomo to match Womo
                    char_array_append(primary, "A");
                    char_array_append(secondary, "A");
                }
            }

            // Arnow should match Arnoff
            if (((current == last) && is_vowel(get_char_at(str, len, current - 1)))
                || substring_equals(str, len, current - 1, 5, "EWSKI", "EWSKY",
                                    "OWSKI", "OWSKY", NULL)
                || substring_equals(str, len, 0, 3, "SCH", NULL))
            {
                char_array_append(secondary, "F");
                current++;
                continue;
            }

            // Polish e.g. "Filipowicz"
            if (substring_equals(str, len, current, 4, "WICZ", "WITZ", NULL)) {
                char_array_append(primary, "TS");
                char_array_append(secondary, "FX");
                current += 4;
                continue;
            }

            // else skip it
            current++;
            continue;
        } else if (c == 'X') {
            // French e.g. "breaux"
            if (!((current == last)
                  && (substring_equals(str, len, current - 3, 3, "IAU", "EAU", NULL)
                   || substring_equals(str, len, current - 2, 2, "AU", "OU", NULL))))
            {
                char_array_append(primary, "KS");
                char_array_append(secondary, "KS");
            }

            if (substring_equals(str, len, current + 1, 1, "C", "X", NULL)) {
                current += 2;
            } else {
                current++;
            }
            continue;
        } else if (c == 'Z') {
            // Chinese Pinyin e.g. "Zhao"
            if (get_char_at(str, len, current + 1) == 'H') {
                char_array_append(primary, "J");
                char_array_append(secondary, "J");
                current += 2;
                continue;
            } else if (substring_equals(str, len, current + 1, 2, "ZO", "ZI", "ZA", NULL)
                   || (slavo_germanic
                       && ((current > 0)
                       && get_char_at(str, len, current - 1) != 'T')))
            {
                char_array_append(primary, "S");
                char_array_append(secondary, "TS");
            } else {
                char_array_append(primary, "S");
                char_array_append(secondary, "S");
            }

            if (get_char_at(str, len, current + 1) == 'Z') {
                current += 2;
            } else {
                current++;
            }
            continue;
        } else {
            current++;
        }
    }

    double_metaphone_codes_t *codes = calloc(1, sizeof(double_metaphone_codes_t));
    if (codes == NULL) {
        char_array_destroy(primary);
        char_array_destroy(secondary);
        return NULL;
    }

    codes->primary = char_array_to_string(primary);
    codes->secondary = char_array_to_string(secondary);

    free(ptr);

    return codes;
}

void double_metaphone_codes_destroy(double_metaphone_codes_t *codes) {
    if (codes != NULL) {
        if (codes->primary != NULL) {
            free(codes->primary);
        }

        if (codes->secondary != NULL) {
            free(codes->secondary);
        }

        free(codes);
    }
}