From 893745f09b89842ca528ba501ba5acc4db9dd6f5 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 25 Mar 2022 14:05:03 -0400 Subject: [PATCH] [near_dupes] using quadgrams in Latin scripts as well for near dupe hashes --- src/near_dupe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/near_dupe.c b/src/near_dupe.c index 45f7c536..06a89ac4 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -387,6 +387,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal log_debug("token_str = %s\n", token_str); add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings, ngrams); + add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams); // For non-Latin words (Arabic, Cyrllic, etc.) just add the word // For ideograms, we do two-character shingles, so only add the first character if the string has one token } else if (!ideogram || j > 0 || num_tokens == 1) {