[near_dupes] using quadgrams in Latin scripts as well for near dupe hashes

2022-03-25 14:05:03 -04:00
parent 26124ee72f
commit 893745f09b
1 changed files with 1 additions and 0 deletions
--- a/src/near_dupe.c
+++ b/src/near_dupe.c
@@ -387,6 +387,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
                log_debug("token_str = %s\n", token_str);

                add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings, ngrams);
+                add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams);
            // For non-Latin words (Arabic, Cyrillic, etc.) just add the word
            // For ideograms, we do two-character shingles, so only add the first character if the string has one token
            } else if (!ideogram || j > 0 || num_tokens == 1) {