From 03e5e25240a1f6f023d77c7c251eb437a4e8a967 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 16 Jan 2018 03:07:32 -0500 Subject: [PATCH] [dedupe] adding a near-dupe hash which takes into account existing acronyms which may have appeared in the string, either known acronyms as defined in the dictionaries like "HS" and includes the full token in the acronym. This feature is particularly useful for public schools or other cases where the canonical string may be used i.e. "Foo High School", "Foo HS" and "FHS". It also does the same thing other acronyms that are identified by the tokenizer from the internal period structure like A.B.C. Also now allowing mixed alpha-numeric tokens to use the double metaphone encoding as well, and for numeric tokens with script=Common (digits but may also contain hyphens, etc.), the full token is included as one of the words rather than quadgrams, which don't make sense for numerics. --- src/near_dupe.c | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/src/near_dupe.c b/src/near_dupe.c index 081e6aa6..7f54cc1e 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -9,6 +9,7 @@ #include "expand.h" #include "features.h" #include "float_utils.h" +#include "normalize.h" #include "ngrams.h" #include "place.h" #include "scanner.h" @@ -183,7 +184,6 @@ static inline cstring_array *expanded_component_root_with_fallback(char *input, } } - static cstring_array *geohash_and_neighbors(double latitude, double longitude, size_t geohash_precision) { if (geohash_precision == 0) return NULL; @@ -301,6 +301,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal token_array *token_array = token_array_new(); uint32_array *stopwords_array = uint32_array_new(); + uint32_array *existing_acronyms_array = uint32_array_new(); char_array *combined_words_no_whitespace = char_array_new(); @@ -332,6 +333,9 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal string_script_t token_script = get_string_script(expansion + token.offset, token.len); bool is_latin = token_script.len == token.len && token_script.script == SCRIPT_LATIN; + bool is_common_script = token_script.len == token.len && token_script.script == SCRIPT_COMMON; + bool is_numeric = is_numeric_token(token.type); + char_array_clear(token_string_array); // For ideograms, since the "words" are characters, we use shingles of two characters if (ideogram && j > 0 && is_ideographic(prev_token.type)) { @@ -342,7 +346,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal char_array_cat_len(combined_words_no_whitespace, expansion + token.offset, token.len); // For Latin script, add double metaphone of the words - if (is_latin && !is_numeric_token(token.type) && !ideogram && !is_punctuation(token.type)) { + if (is_latin && !(is_numeric && is_common_script) && !ideogram && !is_punctuation(token.type)) { char_array_clear(token_string_array); char_array_cat_len(token_string_array, expansion + token.offset, token.len); token_str = char_array_get_string(token_string_array); @@ -357,7 +361,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal token_str = char_array_get_string(token_string_array); log_debug("token_str = %s\n", token_str); - if (!ideogram) { + if (!ideogram && !(is_numeric || is_common_script)) { add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams); } else { add_string_to_array_if_unique(token_str, strings, unique_strings); @@ -381,7 +385,10 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal keep_whitespace = false; tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace); stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages); + existing_acronym_phrase_positions(existing_acronyms_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages); + uint32_t *stopwords = stopwords_array->a; + uint32_t *existing_acronyms = existing_acronyms_array->a; size_t num_tokens = token_array->n; token_t *tokens = token_array->a; @@ -392,16 +399,40 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal bool last_was_stopword = false; bool last_was_punctuation = false; + bool any_existing_acronyms = false; + char_array *temp_norm_token = NULL; + for (size_t j = 0; j < num_tokens; j++) { + if (existing_acronyms[j] > 0) { + any_existing_acronyms = true; + temp_norm_token = char_array_new(); + break; + } + } + for (size_t j = 0; j < num_tokens; j++) { token_t token = tokens[j]; // Make sure it's a non-ideographic word token - if (is_word_token(token.type) && !is_ideographic(token.type)) { + if (!is_ideographic(token.type)) { uint8_t *ptr = (uint8_t *)normalized; int32_t ch = 0; ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch); if (ch_len > 0 && utf8_is_letter(utf8proc_category(ch))) { bool is_stopword = stopwords[j] == 1; + if (existing_acronyms[j] > 0 && !is_stopword) { + uint64_t token_options = NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS; + add_normalized_token(temp_norm_token, normalized, token, token_options); + char *norm_acronym_token = char_array_get_string(temp_norm_token); + + char_array_cat(acronym_with_stopwords, norm_acronym_token); + char_array_cat(acronym_no_stopwords, norm_acronym_token); + char_array_cat(sub_acronym_with_stopwords, norm_acronym_token); + char_array_cat(sub_acronym_no_stopwords, norm_acronym_token); + last_was_stopword = false; + last_was_punctuation = false; + continue; + } + if (!is_stopword && !last_was_punctuation) { char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len); char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len); @@ -456,6 +487,10 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal last_was_punctuation = true; } } + + if (temp_norm_token != NULL) { + char_array_destroy(temp_norm_token); + } } free(normalized); @@ -486,7 +521,6 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal } - char_array_destroy(token_string_array); token_array_destroy(token_array); char_array_destroy(combined_words_no_whitespace); @@ -498,6 +532,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal cstring_array_destroy(ngrams); uint32_array_destroy(stopwords_array); + uint32_array_destroy(existing_acronyms_array); cstring_array_destroy(name_expansions);