From 6ba0403748a396b74a04cd36d3cacf1e09ce1dc1 Mon Sep 17 00:00:00 2001
From: Al
Date: Wed, 10 Jan 2018 22:23:40 -0500
Subject: [PATCH] [dedupe] adding a near-dupe hash for acronyms both with and
 without stopwords.

This creates basic acronyms for institutions like MoMA, UCLA, and the
NAACP, as well as human initials, etc. It also handles sub-acronyms:
when it reaches every other non-contiguous stopword (University of
Texas at Austin) or a punctuation token (University of Texas, Austin),
it cuts a new sub-acronym (so UT). All acronyms in Latin script are
also hashed with double metaphone, so they can potentially catch many
spelling variations. It does not handle every possible acronym (e.g.
where some of the letters are word-internal, as in medical acronyms),
but it should do relatively well on many common variations. A
standalone sketch of the sub-acronym cutting rule follows the diff.
---
 src/acronyms.c  |  16 +++-
 src/acronyms.h  |   2 +
 src/near_dupe.c | 236 ++++++++++++++++++++++++++++++++++++------------
 3 files changed, 191 insertions(+), 63 deletions(-)

diff --git a/src/acronyms.c b/src/acronyms.c
index 425b64f2..64a02746 100644
--- a/src/acronyms.c
+++ b/src/acronyms.c
@@ -1,9 +1,12 @@
 #include "acronyms.h"
 
-static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_t num_languages, char **languages) {
-    size_t len = tokens->n;
-    uint32_array *stopwords_array = uint32_array_new_zeros(len);
+bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) {
+    if (stopwords_array == NULL) return false;
+    if (stopwords_array->n != tokens->n) {
+        uint32_array_resize_fixed(stopwords_array, tokens->n);
+    }
+    uint32_array_zero(stopwords_array->a, stopwords_array->n);
 
     uint32_t *stopwords = stopwords_array->a;
 
     for (size_t l = 0; l < num_languages; l++) {
@@ -25,9 +28,10 @@ static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_
         }
     }
 
-    return stopwords_array;
+    return true;
 }
 
+
 phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) {
     if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) {
         return NULL;
@@ -56,11 +60,13 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con
     token_t *t1 = tokens1->a;
     token_t *t2 = tokens2->a;
 
-    uint32_array *stopwords_array = stopword_tokens(s2, tokens2, num_languages, languages);
+    uint32_array *stopwords_array = uint32_array_new_zeros(tokens2->n);
     if (stopwords_array == NULL) {
        return NULL;
     }
 
+    stopword_positions(stopwords_array, s2, tokens2, num_languages, languages);
+
     uint32_t *stopwords = stopwords_array->a;
 
     ssize_t acronym_start = -1;
diff --git a/src/acronyms.h b/src/acronyms.h
index 5c61002e..8b6a0dd9 100644
--- a/src/acronyms.h
+++ b/src/acronyms.h
@@ -9,6 +9,8 @@
 #include "tokens.h"
 #include "token_types.h"
 
+bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages);
+
 phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages);
 
 
diff --git a/src/near_dupe.c b/src/near_dupe.c
index 0b2efa7f..bed13db4 100644
--- a/src/near_dupe.c
+++ b/src/near_dupe.c
@@ -3,6 +3,8 @@
 
 #include "log/log.h"
 #include "near_dupe.h"
+
+#include "acronyms.h"
 #include "double_metaphone.h"
 #include "expand.h"
 #include "features.h"
@@ -211,6 +213,58 @@ static cstring_array *geohash_and_neighbors(double latitude, double longitude, s
     return NULL;
 }
 
+
+static inline bool add_string_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
+    khiter_t k = kh_get(str_set, unique_strings, str);
+    int ret = 0;
+    if (k == kh_end(unique_strings)) {
+        cstring_array_add_string(strings, str);
+        k = kh_put(str_set, unique_strings, strdup(str), &ret);
+
+        if (ret < 0) {
+            return false;
+        }
+        return true;
+    }
+    return false;
+}
+
+
+static inline bool add_double_metaphone_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
+    if (str == NULL) return false;
+    double_metaphone_codes_t *dm_codes = double_metaphone(str);
+    if (dm_codes == NULL) {
+        return false;
+    }
+    char *dm_primary = dm_codes->primary;
+    char *dm_secondary = dm_codes->secondary;
+
+    if (!string_equals(dm_primary, "")) {
+        add_string_to_array_if_unique(dm_primary, strings, unique_strings);
+
+        if (!string_equals(dm_secondary, dm_primary)) {
+            add_string_to_array_if_unique(dm_secondary, strings, unique_strings);
+        }
+    }
+    double_metaphone_codes_destroy(dm_codes);
+
+    return true;
+}
+
+static inline bool add_double_metaphone_or_token_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
+    if (str == NULL) return false;
+    size_t len = strlen(str);
+    string_script_t token_script = get_string_script(str, len);
+    bool is_latin = token_script.len == len && token_script.script == SCRIPT_LATIN;
+
+    if (is_latin) {
+        return add_double_metaphone_to_array_if_unique(str, strings, unique_strings);
+    } else {
+        return add_string_to_array_if_unique(str, strings, unique_strings);
+    }
+}
+
+
 
 #define MAX_NAME_TOKENS 50
 
@@ -229,16 +283,22 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     cstring_array *strings = cstring_array_new_size(len);
 
     token_array *token_array = token_array_new();
+    uint32_array *stopwords_array = uint32_array_new();
+
     char_array *combined_words_no_whitespace = char_array_new();
+    char_array *acronym_with_stopwords = char_array_new();
+    char_array *acronym_no_stopwords = char_array_new();
+    char_array *sub_acronym_with_stopwords = char_array_new();
+    char_array *sub_acronym_no_stopwords = char_array_new();
+
     khash_t(str_set) *unique_strings = kh_init(str_set);
-    khiter_t k;
-    int ret = 0;
+    bool keep_whitespace = false;
 
     for (size_t i = 0; i < num_expansions; i++) {
         char *expansion = cstring_array_get_string(name_expansions, i);
         log_debug("expansion = %s\n", expansion);
 
-        bool keep_whitespace = false;
+        token_array_clear(token_array);
         tokenize_add_tokens(token_array, expansion, strlen(expansion), keep_whitespace);
         size_t num_tokens = token_array->n;
         token_t *tokens = token_array->a;
@@ -270,80 +330,140 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
 
                 log_debug("token_str = %s\n", token_str);
 
-                double_metaphone_codes_t *dm_codes = double_metaphone(token_str);
-                if (dm_codes == NULL) {
-                    prev_token = token;
-                    continue;
-                }
-                char *dm_primary = dm_codes->primary;
-                char *dm_secondary = dm_codes->secondary;
-
-                if (!string_equals(dm_primary, "")) {
-
-                    k = kh_get(str_set, unique_strings, dm_primary);
-
-                    if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) {
-                        log_debug("adding dm_primary = %s\n", dm_primary);
-                        cstring_array_add_string(strings, dm_primary);
-                        k = kh_put(str_set, unique_strings, strdup(dm_primary), &ret);
-                        if (ret < 0) {
-                            break;
-                        }
-                    }
-
-                    if (!string_equals(dm_secondary, dm_primary)) {
-
-                        k = kh_get(str_set, unique_strings, dm_secondary);
-
-                        if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) {
-                            log_debug("adding dm_secondary = %s\n", dm_secondary);
-                            cstring_array_add_string(strings, dm_secondary);
-                            k = kh_put(str_set, unique_strings, strdup(dm_secondary), &ret);
-                            if (ret < 0) {
-                                break;
-                            }
-                        }
-                    }
-                }
-                double_metaphone_codes_destroy(dm_codes);
+                add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings);
             // For non-Latin words (Arabic, Cyrllic, etc.) just add the word
             // For ideograms, we do two-character shingles, so only add the first character if the string has one token
             } else if (!ideogram || j > 0 || num_tokens == 1) {
                 char_array_cat_len(token_string_array, expansion + token.offset, token.len);
                 token_str = char_array_get_string(token_string_array);
                 log_debug("token_str = %s\n", token_str);
-                k = kh_get(str_set, unique_strings, token_str);
-                if (k == kh_end(unique_strings)) {
-                    cstring_array_add_string(strings, token_str);
-                    k = kh_put(str_set, unique_strings, strdup(token_str), &ret);
-                    if (ret < 0) {
-                        break;
-                    }
-                }
+                add_string_to_array_if_unique(token_str, strings, unique_strings);
             }
 
             prev_token = token;
         }
 
-        char *combined = char_array_get_string(combined_words_no_whitespace);
-        log_debug("combined = %s\n", combined);
-        k = kh_get(str_set, unique_strings, combined);
+        if (combined_words_no_whitespace->n > 0) {
+            char *combined = char_array_get_string(combined_words_no_whitespace);
+            add_string_to_array_if_unique(combined, strings, unique_strings);
+        }
 
-        if (k == kh_end(unique_strings)) {
-            cstring_array_add_string(strings, combined);
-            k = kh_put(str_set, unique_strings, strdup(combined), &ret);
-            if (ret < 0) {
-                break;
+    }
+
+    token_array_clear(token_array);
+    char *normalized = libpostal_normalize_string(name, LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS);
+    char *acronym = NULL;
+    if (normalized != NULL) {
+        keep_whitespace = false;
+        tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace);
+        stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages);
+        uint32_t *stopwords = stopwords_array->a;
+
+        size_t num_tokens = token_array->n;
+        token_t *tokens = token_array->a;
+        num_tokens = token_array->n;
+
+        if (num_tokens > 1) {
+            size_t num_stopwords_encountered = 0;
+            bool last_was_stopword = false;
+            bool last_was_punctuation = false;
+
+            for (size_t j = 0; j < num_tokens; j++) {
+                token_t token = tokens[j];
+                // Make sure it's a non-ideographic word token
+                if (is_word_token(token.type) && !is_ideographic(token.type)) {
+                    uint8_t *ptr = (uint8_t *)normalized;
+                    int32_t ch = 0;
+                    ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch);
+                    if (ch_len > 0 && utf8_is_letter(utf8proc_category(ch))) {
+                        bool is_stopword = stopwords[j] == 1;
+
+                        if (!is_stopword && !last_was_punctuation) {
+                            char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
+                            char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
+
+                            if (!(last_was_stopword && j == num_tokens - 1)) {
+                                char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
+                                char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len);
+                            }
+                            last_was_stopword = false;
+                        } else {
+                            if (!last_was_stopword && is_stopword) {
+                                num_stopwords_encountered++;
+                            }
+
+                            char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
+                            if (!is_stopword) {
+                                char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
+                            }
+
+                            if ((num_stopwords_encountered % 2 == 0 || last_was_punctuation) && acronym_no_stopwords->n > 1) {
+                                acronym = char_array_get_string(sub_acronym_with_stopwords);
+                                log_debug("sub acronym stopwords = %s\n", acronym);
+
+                                char_array_clear(sub_acronym_with_stopwords);
+
+                                add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
+
+                                acronym = char_array_get_string(sub_acronym_no_stopwords);
+                                log_debug("sub acronym no stopwords = %s\n", acronym);
+                                add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
+                                char_array_clear(sub_acronym_no_stopwords);
+                            } else if (!((last_was_stopword || last_was_punctuation) && j == num_tokens - 1)) {
+                                char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
+                            }
+
+                            last_was_stopword = is_stopword;
+                        }
+                        last_was_punctuation = false;
+                    }
+                } else if (is_punctuation(token.type)) {
+                    log_debug("punctuation\n");
+                    last_was_punctuation = true;
+                }
+
             }
         }
 
-        token_array_clear(token_array);
+        free(normalized);
     }
 
+    if (acronym_no_stopwords->n > 0) {
+        acronym = char_array_get_string(acronym_with_stopwords);
+        log_debug("acronym with stopwords = %s\n", acronym);
+        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
+    }
+
+    if (acronym_with_stopwords->n > 0) {
+        acronym = char_array_get_string(acronym_no_stopwords);
+        log_debug("acronym no stopwords = %s\n", acronym);
+        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
+
+    }
+
+    if (sub_acronym_no_stopwords->n > 0) {
+        acronym = char_array_get_string(sub_acronym_with_stopwords);
+        log_debug("final sub acronym stopwords = %s\n", acronym);
+        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
+    }
+
+    if (sub_acronym_with_stopwords->n > 0) {
+        acronym = char_array_get_string(sub_acronym_no_stopwords);
+        log_debug("final sub acronym no stopwords = %s\n", acronym);
+        add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
+    }
+
+
+
     char_array_destroy(token_string_array);
 
     token_array_destroy(token_array);
 
     char_array_destroy(combined_words_no_whitespace);
+    char_array_destroy(acronym_with_stopwords);
+    char_array_destroy(acronym_no_stopwords);
+    char_array_destroy(sub_acronym_with_stopwords);
+    char_array_destroy(sub_acronym_no_stopwords);
+
+    uint32_array_destroy(stopwords_array);
 
     cstring_array_destroy(name_expansions);
@@ -375,7 +495,7 @@ static inline void add_string_arrays_to_tree(string_tree_t *tree, size_t n, va_l
 static inline void add_hashes_from_tree(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree) {
     string_tree_iterator_t *iter = string_tree_iterator_new(tree);
     if (iter->num_tokens > 0) {
-        log_debug("iter->num_tokens = %zu\n", iter->num_tokens);
+        log_debug("iter->num_tokens = %u\n", iter->num_tokens);
 
         for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
 
@@ -407,7 +527,7 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes,
     add_string_arrays_to_tree(tree, n, args);
     va_end(args);
 
-    log_debug("string_tree_num_strings(tree)=%zu\n", string_tree_num_strings(tree));
+    log_debug("string_tree_num_strings(tree)=%u\n", string_tree_num_strings(tree));
 
     add_hashes_from_tree(near_dupe_hashes, prefix, tree);
 }
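
For illustration only, here is a minimal standalone sketch of the sub-acronym cutting rule described in the commit message. It is not libpostal code: the hard-coded token list stands in for the output of tokenize_add_tokens(), the tiny stopword set stands in for the per-language dictionaries consulted by stopword_positions(), normalization is skipped, and the double metaphone step is omitted.

/* sub_acronym_sketch.c -- simplified illustration, not part of the patch.
 * Walks the tokens of "University of Texas at Austin", appends the first
 * letter of each non-stopword token, and cuts a new sub-acronym at every
 * other non-contiguous stopword or at a punctuation token. */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Stand-in stopword list; libpostal uses per-language dictionaries. */
static bool is_stopword(const char *tok) {
    static const char *stops[] = {"of", "at", "the", "and", "de", "la"};
    for (size_t i = 0; i < sizeof(stops) / sizeof(stops[0]); i++) {
        if (strcmp(tok, stops[i]) == 0) return true;
    }
    return false;
}

static void append_initial(char *buf, const char *tok) {
    size_t len = strlen(buf);
    buf[len] = tok[0];          /* ASCII-only here; the patch uses utf8proc */
    buf[len + 1] = '\0';
}

int main(void) {
    /* A comma token would trigger the punctuation cut instead
       ("University of Texas, Austin"). */
    const char *tokens[] = {"University", "of", "Texas", "at", "Austin"};
    size_t n = sizeof(tokens) / sizeof(tokens[0]);

    char acronym[64] = "";      /* initials with stopwords skipped: "UTA" */
    char sub_acronym[64] = "";  /* cut at every other stopword / punct: "UT" */
    size_t stopwords_seen = 0;
    bool last_was_stopword = false;

    for (size_t i = 0; i < n; i++) {
        const char *tok = tokens[i];
        bool punct = ispunct((unsigned char)tok[0]);
        bool stop = !punct && is_stopword(tok);

        if (!punct && !stop) {
            append_initial(acronym, tok);
            append_initial(sub_acronym, tok);
            last_was_stopword = false;
        } else {
            if (stop && !last_was_stopword) stopwords_seen++;
            /* Cut at punctuation or at every second non-contiguous stopword. */
            if ((punct || stopwords_seen % 2 == 0) && strlen(sub_acronym) > 1) {
                printf("sub acronym = %s\n", sub_acronym);    /* UT */
                sub_acronym[0] = '\0';
            }
            last_was_stopword = stop;
        }
    }

    printf("acronym (no stopwords) = %s\n", acronym);         /* UTA */
    return 0;
}

Compiled and run, this prints sub acronym = UT and acronym (no stopwords) = UTA, matching the example in the commit message. In the patch itself, each Latin-script acronym is additionally hashed with double metaphone via add_double_metaphone_or_token_if_unique() before being added to the set of near-dupe strings.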