From c61007388bde7cb92782747fe83ec80e97c7a244 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 18 Oct 2017 04:00:57 -0400 Subject: [PATCH] [similarity] bug fixes and additional French, Spanish, Italian, and Slavic phonetics --- src/double_metaphone.c | 107 +++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/src/double_metaphone.c b/src/double_metaphone.c index e28264e2..d911b22c 100644 --- a/src/double_metaphone.c +++ b/src/double_metaphone.c @@ -54,12 +54,20 @@ static inline bool substring_equals(char *str, size_t len, ssize_t index, size_t } - double_metaphone_codes_t *double_metaphone(char *input) { if (input == NULL) return NULL; char *ptr = utf8_upper(input); + /* Note: NFD normalization will help with simple decomposable accent characters + like "É", "Ü", etc. which effectively become "E\u0301" and "U\u0308". It does + not handle characters like "Ł". For these, use Latin-ASCII transliteration + prior to calling this function. + + We can still check for a specific accented character like C with cedilla (Ç), + by comparing with its decomposed form i.e. "C\xcc\xa7" + */ + char *normalized = (char *)utf8proc_NFD((utf8proc_uint8_t *)ptr); if (normalized != NULL) { @@ -100,7 +108,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char c = *(str + current); if (c == '\x00') break; - if (is_vowel(c) && current == 0) { + if (current == 0 && is_vowel(c)) { char_array_append(primary, "A"); char_array_append(secondary, "A"); current++; @@ -116,8 +124,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { current++; } continue; - // C with cedilla (denormalized) - } else if (substring_equals(str, len, current, 3, 2, "C\xcc\xa7", "Ç")) { + // Ç - C with cedilla (denormalized) + } else if (substring_equals(str, len, current, 3, 1, "C\xcc\xa7")) { char_array_append(primary, "S"); char_array_append(secondary, "S"); current += 2; @@ -125,7 +133,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { // various germanic if ((current > 1) && !is_vowel(get_char_at(str, len, current - 2)) - && substring_equals(str, len, current - 1, 3, 1, "ACH") + && (substring_equals(str, len, current - 1, 3, 1, "ACH") + && !substring_equals(str, len, current + 2, 1, 3, "O", "A", "U")) && ((get_char_at(str, len, current + 2) != 'I') && ((get_char_at(str, len, current + 2) != 'E') || substring_equals(str, len, current - 2, 6, 2, "BACHER", "MACHER")) @@ -162,7 +171,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { && substring_equals(str, len, current, 4, 1, "CHAE")) { char_array_append(primary, "K"); - char_array_append(secondary, "K"); + char_array_append(secondary, "X"); current += 2; continue; } @@ -171,7 +180,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { if ((current == 0) && (substring_equals(str, len, current + 1, 5, 3, "HARAC", "HARIS", "HOREO") || substring_equals(str, len, current + 1, 4, 3, "HIRO", "HAOS", "HAOT") - || substring_equals(str, len, current + 1, 3, 5, "HOR", "HYM", "HIA", "HEM", "HIM")) + || (substring_equals(str, len, current + 1, 3, 5, "HOR", "HYM", "HIA", "HEM", "HIM") && !substring_equals(str, len, current + 1, 5, 2, "HEMIN"))) ) { char_array_append(primary, "K"); @@ -186,19 +195,21 @@ double_metaphone_codes_t *double_metaphone(char *input) { || substring_equals(str, len, current - 5, 5, 2, " VAN ", " VON ") || substring_equals(str, len, 0, 3, 1, "SCH")) // "ochestra", "orchid", "architect" but not "arch" - || substring_equals(str, len, current - 2, 6, 1, "ORCHES", "ARCHIT", "ORCHID") + || substring_equals(str, len, current - 2, 6, 3, "ORCHES", "ARCHIT", "ORCHID") || substring_equals(str, len, current + 2, 1, 2, "T", "S") || ( - ((current == 0) || substring_equals(str, len, current - 1, 1, 4, "A", "O", "U", "E")) + (((current == 0) || substring_equals(str, len, current - 1, 1, 4, "A", "O", "U", "E")) + // e.g. not "breach", "broach", "pouch", "beech", etc. + && !substring_equals(str, len, current - 2, 2, 6, "EA", "OU", "EE", "OA", "OO", "AU") + // e.g. not "lunch", "birch", "gulch" + && !substring_equals(str, len, current - 1, 1, 3, "L", "R", "N")) // e.g. "wachtler", "wechsler", but not "tichner" - && substring_equals(str, len, current + 2, 1, 10, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ") + && ((current + 1 == last) || substring_equals(str, len, current + 2, 1, 10, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ")) ) ) { char_array_append(primary, "K"); char_array_append(secondary, "K"); - current += 2; - continue; } else { if (current > 0) { if (substring_equals(str, len, 0, 2, 1, "MC")) { @@ -210,7 +221,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { } } else { char_array_append(primary, "X"); - char_array_append(secondary, "K"); + char_array_append(secondary, "X"); } } current += 2; @@ -227,21 +238,13 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - // e.g. "focaccia" - if (substring_equals(str, len, current + 1, 3, 1, "CIA")) { - char_array_append(primary, "X"); - char_array_append(secondary, "X"); - current += 3; - continue; - } - // double 'C' but not if e.g. "McClellan" if (substring_equals(str, len, current, 2, 1, "CC") && !((current == 1) && get_char_at(str, len, 0) == 'M')) { // "bellocchio" but not "bacchus" if (substring_equals(str, len, current + 2, 1, 3, "I", "E", "H") - && !substring_equals(str, len, current + 2, 2, 1, "HU")) + && !substring_equals(str, len, current + 2, 3, 4, "HUS", "HUM", "HUN", "HAN")) { // "accident", "accede", "succeed" if (((current == 1) @@ -250,18 +253,24 @@ double_metaphone_codes_t *double_metaphone(char *input) { { char_array_append(primary, "KS"); char_array_append(secondary, "KS"); + // "pinocchio" but not "riccio" or "picchu" + } else if (get_char_at(str, len, current + 2) == 'H' + && !substring_equals(str, len, current + 2, 2, 2, "HU", "HA")) { + char_array_append(primary, "K"); + char_array_append(secondary, "X"); } else { char_array_append(primary, "X"); char_array_append(secondary, "X"); } current += 3; continue; + } else { + // Pierce's rule + char_array_append(primary, "K"); + char_array_append(secondary, "K"); + current += 2; + continue; } - } else { - char_array_append(primary, "K"); - char_array_append(secondary, "K"); - current += 2; - continue; } if (substring_equals(str, len, current, 2, 3, "CK", "CG", "CQ")) { @@ -271,8 +280,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - if (substring_equals(str, len, current, 2, 3, "CI", "CE", "CY")) { - if (substring_equals(str, len, current, 3, 3, "CIO", "CIE", "CIA")) { + if (substring_equals(str, len, current, 2, 4, "CI", "CJ", "CE", "CY")) { + if (substring_equals(str, len, current, 3, 5, "CIO", "CIE", "CIA", "CIU")) { char_array_append(primary, "S"); char_array_append(secondary, "X"); } else { @@ -297,11 +306,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { current++; } - continue; - } else if (substring_equals(str, len, current, 2, 1, "Đ")) { - char_array_append(primary, "T"); - char_array_append(secondary, "T"); - current += 2; continue; } else if (c == 'D') { if (substring_equals(str, len, current, 2, 1, "DG")) { @@ -350,7 +354,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { continue; } - if (current < 3) { + if (current == 0) { // "ghislane", "ghiradelli" if (get_char_at(str, len, current + 2) == 'I') { char_array_append(primary, "J"); @@ -386,7 +390,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "F"); char_array_append(secondary, "F"); } else if ((current > 0) - && get_char_at(str, len, current - 1) == 'I') + && get_char_at(str, len, current - 1) != 'I') { char_array_append(primary, "K"); char_array_append(secondary, "K"); @@ -399,7 +403,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { if (get_char_at(str, len, current + 1) == 'N') { if ((current == 1) && is_vowel(get_char_at(str, len, 0)) - && slavo_germanic) + && !slavo_germanic) { char_array_append(primary, "KN"); char_array_append(secondary, "N"); @@ -431,7 +435,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { // -ges-, -gep-, -gel-, -gie- at beginning if ((current == 0) && ((get_char_at(str, len, current + 1) == 'Y') - || substring_equals(str, len, current + 1, 2, 13, "ES", "EP", + || substring_equals(str, len, current + 1, 2, 11, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER"))) { @@ -469,6 +473,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { { char_array_append(primary, "K"); char_array_append(secondary, "K"); + } else { if (substring_equals(str, len, current + 1, 4, 1, "IER ") || ((current == len - 3) && substring_equals(str, len, current + 1, 3, 1, "IER"))) @@ -479,9 +484,9 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "J"); char_array_append(secondary, "K"); } - current += 2; - continue; } + current += 2; + continue; } if (get_char_at(str, len, current + 1) == 'G') { @@ -546,7 +551,7 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "J"); char_array_append(secondary, "H"); } else { - if (current == last) { + if (current == last || ((current == last - 1 || get_char_at(str, len, current + 2) == ' ') && isalpha(get_char_at(str, len, current - 1)) && substring_equals(str, len, current + 1, 1, 2, "A", "O"))) { char_array_append(primary, "J"); } else { if (!substring_equals(str, len, current + 1, 1, 8, "L", "T", @@ -559,7 +564,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { } } - // it could happen! if (get_char_at(str, len, current + 1) == 'J') { current += 2; @@ -578,11 +582,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "K"); char_array_append(secondary, "K"); continue; - } else if (substring_equals(str, len, current, 2, 1, "Ł")) { - current += 2; - char_array_append(primary, "L"); - char_array_append(secondary, "L"); - continue; } else if (c == 'L') { if (get_char_at(str, len, current + 1) == 'L') { // Spanish e.g. "Cabrillo", "Gallegos" @@ -619,6 +618,12 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(primary, "M"); char_array_append(secondary, "M"); continue; + // Ñ (NFD normalized) + } else if (substring_equals(str, len, current, 3, 1, "N\xcc\x83")) { + current += 3; + char_array_append(primary, "N"); + char_array_append(secondary, "N"); + continue; } else if (c == 'N') { if (get_char_at(str, len, current + 1) == 'N') { current += 2; @@ -626,11 +631,6 @@ double_metaphone_codes_t *double_metaphone(char *input) { current++; } - char_array_append(primary, "N"); - char_array_append(secondary, "N"); - continue; - } else if (substring_equals(str, len, current, 2, 1, "Ñ")) { - current += 2; char_array_append(primary, "N"); char_array_append(secondary, "N"); continue; @@ -798,7 +798,8 @@ double_metaphone_codes_t *double_metaphone(char *input) { char_array_append(secondary, "S"); } - if (substring_equals(str, len, current - 1, 1, 2, "S", "Z")) { + if (substring_equals(str, len, current + 1, 1, 2, "S", "Z")) { + current += 2; } else { current++;