diff --git a/src/acronyms.c b/src/acronyms.c index 425b64f2..64a02746 100644 --- a/src/acronyms.c +++ b/src/acronyms.c @@ -1,9 +1,12 @@ #include "acronyms.h" -static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_t num_languages, char **languages) { - size_t len = tokens->n; - uint32_array *stopwords_array = uint32_array_new_zeros(len); +bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) { + if (stopwords_array == NULL) return false; + if (stopwords_array->n != tokens->n) { + uint32_array_resize_fixed(stopwords_array, tokens->n); + } + uint32_array_zero(stopwords_array->a, stopwords_array->n); uint32_t *stopwords = stopwords_array->a; for (size_t l = 0; l < num_languages; l++) { @@ -25,9 +28,10 @@ static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_ } } - return stopwords_array; + return true; } + phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) { if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) { return NULL; @@ -56,11 +60,13 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con token_t *t1 = tokens1->a; token_t *t2 = tokens2->a; - uint32_array *stopwords_array = stopword_tokens(s2, tokens2, num_languages, languages); + uint32_array *stopwords_array = uint32_array_new_zeros(tokens2->n); if (stopwords_array == NULL) { return NULL; } + stopword_positions(stopwords_array, s2, tokens2, num_languages, languages); + uint32_t *stopwords = stopwords_array->a; ssize_t acronym_start = -1; diff --git a/src/acronyms.h b/src/acronyms.h index 5c61002e..8b6a0dd9 100644 --- a/src/acronyms.h +++ b/src/acronyms.h @@ -9,6 +9,8 @@ #include "tokens.h" #include "token_types.h" +bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages); + phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages); diff --git a/src/near_dupe.c b/src/near_dupe.c index 0b2efa7f..bed13db4 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -3,6 +3,8 @@ #include "log/log.h" #include "near_dupe.h" + +#include "acronyms.h" #include "double_metaphone.h" #include "expand.h" #include "features.h" @@ -211,6 +213,58 @@ static cstring_array *geohash_and_neighbors(double latitude, double longitude, s return NULL; } + +static inline bool add_string_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) { + khiter_t k = kh_get(str_set, unique_strings, str); + int ret = 0; + if (k == kh_end(unique_strings)) { + cstring_array_add_string(strings, str); + k = kh_put(str_set, unique_strings, strdup(str), &ret); + + if (ret < 0) { + return false; + } + return true; + } + return false; +} + + +static inline bool add_double_metaphone_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) { + if (str == NULL) return false; + double_metaphone_codes_t *dm_codes = double_metaphone(str); + if (dm_codes == NULL) { + return false; + } + char *dm_primary = dm_codes->primary; + char *dm_secondary = dm_codes->secondary; + + if (!string_equals(dm_primary, "")) { + add_string_to_array_if_unique(dm_primary, strings, unique_strings); + + if (!string_equals(dm_secondary, dm_primary)) { + add_string_to_array_if_unique(dm_secondary, strings, unique_strings); + } + } + double_metaphone_codes_destroy(dm_codes); + + return true; +} + +static inline bool add_double_metaphone_or_token_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) { + if (str == NULL) return false; + size_t len = strlen(str); + string_script_t token_script = get_string_script(str, len); + bool is_latin = token_script.len == len && token_script.script == SCRIPT_LATIN; + + if (is_latin) { + return add_double_metaphone_to_array_if_unique(str, strings, unique_strings); + } else { + return add_string_to_array_if_unique(str, strings, unique_strings); + } +} + + #define MAX_NAME_TOKENS 50 @@ -229,16 +283,22 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal cstring_array *strings = cstring_array_new_size(len); token_array *token_array = token_array_new(); + uint32_array *stopwords_array = uint32_array_new(); + char_array *combined_words_no_whitespace = char_array_new(); + char_array *acronym_with_stopwords = char_array_new(); + char_array *acronym_no_stopwords = char_array_new(); + char_array *sub_acronym_with_stopwords = char_array_new(); + char_array *sub_acronym_no_stopwords = char_array_new(); + khash_t(str_set) *unique_strings = kh_init(str_set); - khiter_t k; - int ret = 0; + bool keep_whitespace = false; for (size_t i = 0; i < num_expansions; i++) { char *expansion = cstring_array_get_string(name_expansions, i); log_debug("expansion = %s\n", expansion); - bool keep_whitespace = false; + token_array_clear(token_array); tokenize_add_tokens(token_array, expansion, strlen(expansion), keep_whitespace); size_t num_tokens = token_array->n; token_t *tokens = token_array->a; @@ -270,80 +330,140 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal log_debug("token_str = %s\n", token_str); - double_metaphone_codes_t *dm_codes = double_metaphone(token_str); - if (dm_codes == NULL) { - prev_token = token; - continue; - } - char *dm_primary = dm_codes->primary; - char *dm_secondary = dm_codes->secondary; - - if (!string_equals(dm_primary, "")) { - - k = kh_get(str_set, unique_strings, dm_primary); - - if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { - log_debug("adding dm_primary = %s\n", dm_primary); - cstring_array_add_string(strings, dm_primary); - k = kh_put(str_set, unique_strings, strdup(dm_primary), &ret); - if (ret < 0) { - break; - } - } - - if (!string_equals(dm_secondary, dm_primary)) { - - k = kh_get(str_set, unique_strings, dm_secondary); - - if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) { - log_debug("adding dm_secondary = %s\n", dm_secondary); - cstring_array_add_string(strings, dm_secondary); - k = kh_put(str_set, unique_strings, strdup(dm_secondary), &ret); - if (ret < 0) { - break; - } - } - } - } - double_metaphone_codes_destroy(dm_codes); + add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings); // For non-Latin words (Arabic, Cyrllic, etc.) just add the word // For ideograms, we do two-character shingles, so only add the first character if the string has one token } else if (!ideogram || j > 0 || num_tokens == 1) { char_array_cat_len(token_string_array, expansion + token.offset, token.len); token_str = char_array_get_string(token_string_array); log_debug("token_str = %s\n", token_str); - k = kh_get(str_set, unique_strings, token_str); - if (k == kh_end(unique_strings)) { - cstring_array_add_string(strings, token_str); - k = kh_put(str_set, unique_strings, strdup(token_str), &ret); - if (ret < 0) { - break; - } - } + add_string_to_array_if_unique(token_str, strings, unique_strings); } prev_token = token; } - char *combined = char_array_get_string(combined_words_no_whitespace); - log_debug("combined = %s\n", combined); - k = kh_get(str_set, unique_strings, combined); + if (combined_words_no_whitespace->n > 0) { + char *combined = char_array_get_string(combined_words_no_whitespace); + add_string_to_array_if_unique(combined, strings, unique_strings); + } - if (k == kh_end(unique_strings)) { - cstring_array_add_string(strings, combined); - k = kh_put(str_set, unique_strings, strdup(combined), &ret); - if (ret < 0) { - break; + } + + token_array_clear(token_array); + char *normalized = libpostal_normalize_string(name, LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS); + char *acronym = NULL; + if (normalized != NULL) { + keep_whitespace = false; + tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace); + stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages); + uint32_t *stopwords = stopwords_array->a; + + size_t num_tokens = token_array->n; + token_t *tokens = token_array->a; + num_tokens = token_array->n; + + if (num_tokens > 1) { + size_t num_stopwords_encountered = 0; + bool last_was_stopword = false; + bool last_was_punctuation = false; + + for (size_t j = 0; j < num_tokens; j++) { + token_t token = tokens[j]; + // Make sure it's a non-ideographic word token + if (is_word_token(token.type) && !is_ideographic(token.type)) { + uint8_t *ptr = (uint8_t *)normalized; + int32_t ch = 0; + ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch); + if (ch_len > 0 && utf8_is_letter(utf8proc_category(ch))) { + bool is_stopword = stopwords[j] == 1; + + if (!is_stopword && !last_was_punctuation) { + char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len); + char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len); + + if (!(last_was_stopword && j == num_tokens - 1)) { + char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len); + char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len); + } + last_was_stopword = false; + } else { + if (!last_was_stopword && is_stopword) { + num_stopwords_encountered++; + } + + char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len); + if (!is_stopword) { + char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len); + } + + if ((num_stopwords_encountered % 2 == 0 || last_was_punctuation) && acronym_no_stopwords->n > 1) { + acronym = char_array_get_string(sub_acronym_with_stopwords); + log_debug("sub acronym stopwords = %s\n", acronym); + + char_array_clear(sub_acronym_with_stopwords); + + add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings); + + acronym = char_array_get_string(sub_acronym_no_stopwords); + log_debug("sub acronym no stopwords = %s\n", acronym); + add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings); + char_array_clear(sub_acronym_no_stopwords); + } else if (!((last_was_stopword || last_was_punctuation) && j == num_tokens - 1)) { + char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len); + } + + last_was_stopword = is_stopword; + } + last_was_punctuation = false; + } + } else if (is_punctuation(token.type)) { + log_debug("punctuation\n"); + last_was_punctuation = true; + } } } - token_array_clear(token_array); + free(normalized); } + if (acronym_no_stopwords->n > 0) { + acronym = char_array_get_string(acronym_with_stopwords); + log_debug("acronym with stopwords = %s\n", acronym); + add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings); + } + + if (acronym_with_stopwords->n > 0) { + acronym = char_array_get_string(acronym_no_stopwords); + log_debug("acronym no stopwords = %s\n", acronym); + add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings); + + } + + if (sub_acronym_no_stopwords->n > 0) { + acronym = char_array_get_string(sub_acronym_with_stopwords); + log_debug("final sub acronym stopwords = %s\n", acronym); + add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings); + } + + if (sub_acronym_with_stopwords->n > 0) { + acronym = char_array_get_string(sub_acronym_no_stopwords); + log_debug("final sub acronym no stopwords = %s\n", acronym); + add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings); + } + + + char_array_destroy(token_string_array); token_array_destroy(token_array); char_array_destroy(combined_words_no_whitespace); + char_array_destroy(acronym_with_stopwords); + char_array_destroy(acronym_no_stopwords); + char_array_destroy(sub_acronym_with_stopwords); + char_array_destroy(sub_acronym_no_stopwords); + + uint32_array_destroy(stopwords_array); cstring_array_destroy(name_expansions); @@ -375,7 +495,7 @@ static inline void add_string_arrays_to_tree(string_tree_t *tree, size_t n, va_l static inline void add_hashes_from_tree(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree) { string_tree_iterator_t *iter = string_tree_iterator_new(tree); if (iter->num_tokens > 0) { - log_debug("iter->num_tokens = %zu\n", iter->num_tokens); + log_debug("iter->num_tokens = %u\n", iter->num_tokens); for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) { @@ -407,7 +527,7 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes, add_string_arrays_to_tree(tree, n, args); va_end(args); - log_debug("string_tree_num_strings(tree)=%zu\n", string_tree_num_strings(tree)); + log_debug("string_tree_num_strings(tree)=%u\n", string_tree_num_strings(tree)); add_hashes_from_tree(near_dupe_hashes, prefix, tree); }