[dedupe] adding a near-dupe hash for acronyms both with and without stopwords. This will create basic acronyms for institutions like MoMA, UCLA, the NAACP, as well as human initials, etc. It also handles sub-acronyms: at every other non-contiguous stopword (University of Texas at Austin) or at punctuation (University of Texas, Austin), it cuts a new sub-acronym (so UT). All of the acronyms for Latin script use a double metaphone as well, so they can potentially catch many cases. It does not handle all possible acronyms (e.g. where some of the letters are word-internal, as in medical acronyms), but should do relatively well on many common variations.
This commit is contained in:
@@ -1,9 +1,12 @@
|
||||
#include "acronyms.h"
|
||||
|
||||
static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_t num_languages, char **languages) {
|
||||
size_t len = tokens->n;
|
||||
uint32_array *stopwords_array = uint32_array_new_zeros(len);
|
||||
bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages) {
|
||||
if (stopwords_array == NULL) return false;
|
||||
if (stopwords_array->n != tokens->n) {
|
||||
uint32_array_resize_fixed(stopwords_array, tokens->n);
|
||||
}
|
||||
|
||||
uint32_array_zero(stopwords_array->a, stopwords_array->n);
|
||||
uint32_t *stopwords = stopwords_array->a;
|
||||
|
||||
for (size_t l = 0; l < num_languages; l++) {
|
||||
@@ -25,9 +28,10 @@ static uint32_array *stopword_tokens(const char *str, token_array *tokens, size_
|
||||
}
|
||||
}
|
||||
|
||||
return stopwords_array;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages) {
|
||||
if (s1 == NULL || tokens1 == NULL || s2 == NULL || tokens2 == NULL) {
|
||||
return NULL;
|
||||
@@ -56,11 +60,13 @@ phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, con
|
||||
token_t *t1 = tokens1->a;
|
||||
token_t *t2 = tokens2->a;
|
||||
|
||||
uint32_array *stopwords_array = stopword_tokens(s2, tokens2, num_languages, languages);
|
||||
uint32_array *stopwords_array = uint32_array_new_zeros(tokens2->n);
|
||||
if (stopwords_array == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
stopword_positions(stopwords_array, s2, tokens2, num_languages, languages);
|
||||
|
||||
uint32_t *stopwords = stopwords_array->a;
|
||||
|
||||
ssize_t acronym_start = -1;
|
||||
|
||||
@@ -9,6 +9,8 @@
|
||||
#include "tokens.h"
|
||||
#include "token_types.h"
|
||||
|
||||
bool stopword_positions(uint32_array *stopwords_array, const char *str, token_array *tokens, size_t num_languages, char **languages);
|
||||
|
||||
phrase_array *acronym_token_alignments(const char *s1, token_array *tokens1, const char *s2, token_array *tokens2, size_t num_languages, char **languages);
|
||||
|
||||
|
||||
|
||||
236
src/near_dupe.c
236
src/near_dupe.c
@@ -3,6 +3,8 @@
|
||||
#include "log/log.h"
|
||||
|
||||
#include "near_dupe.h"
|
||||
|
||||
#include "acronyms.h"
|
||||
#include "double_metaphone.h"
|
||||
#include "expand.h"
|
||||
#include "features.h"
|
||||
@@ -211,6 +213,58 @@ static cstring_array *geohash_and_neighbors(double latitude, double longitude, s
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
// Append str to strings only if it has not been seen before.
// unique_strings owns a strdup'd copy of each key; the caller is
// responsible for freeing those keys when destroying the set.
// Returns true if the string was newly added, false if it was a
// duplicate or if allocation/insertion failed.
static inline bool add_string_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
    khiter_t k = kh_get(str_set, unique_strings, str);
    if (k != kh_end(unique_strings)) {
        // Already present: nothing to add
        return false;
    }

    char *key = strdup(str);
    if (key == NULL) return false;

    int ret = 0;
    k = kh_put(str_set, unique_strings, key, &ret);
    if (ret < 0) {
        // Insertion failed: free the copy so it doesn't leak, and do
        // not append to the output array so set and array stay in sync
        // (previously the string was appended before the put was checked,
        // leaking the dup and leaving the array inconsistent on failure)
        free(key);
        return false;
    }

    cstring_array_add_string(strings, str);
    return true;
}
|
||||
|
||||
|
||||
// Compute the double metaphone codes for str and add each distinct,
// non-empty code to strings (deduplicated via unique_strings).
// Returns false if str is NULL or the codes could not be computed,
// true otherwise (even when both codes were empty or duplicates).
static inline bool add_double_metaphone_to_array_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
    if (str == NULL) return false;
    double_metaphone_codes_t *dm_codes = double_metaphone(str);
    if (dm_codes == NULL) {
        return false;
    }
    char *dm_primary = dm_codes->primary;
    char *dm_secondary = dm_codes->secondary;

    if (!string_equals(dm_primary, "")) {
        add_string_to_array_if_unique(dm_primary, strings, unique_strings);

        // Only add the secondary code when it's a distinct, non-empty
        // alternative; previously an empty secondary (which differs from a
        // non-empty primary) could be added to the output as "".
        if (!string_equals(dm_secondary, dm_primary) && !string_equals(dm_secondary, "")) {
            add_string_to_array_if_unique(dm_secondary, strings, unique_strings);
        }
    }
    double_metaphone_codes_destroy(dm_codes);

    return true;
}
|
||||
|
||||
// Add either the double metaphone codes of str (when the whole string is
// Latin script) or the raw string itself to strings, deduplicated through
// unique_strings. Returns false on NULL input, otherwise the result of the
// underlying add.
static inline bool add_double_metaphone_or_token_if_unique(char *str, cstring_array *strings, khash_t(str_set) *unique_strings) {
    if (str == NULL) return false;

    size_t n = strlen(str);
    string_script_t script_info = get_string_script(str, n);
    // Metaphone only applies when every byte of the string is Latin script
    bool whole_string_latin = (script_info.script == SCRIPT_LATIN) && (script_info.len == n);

    if (!whole_string_latin) {
        return add_string_to_array_if_unique(str, strings, unique_strings);
    }
    return add_double_metaphone_to_array_if_unique(str, strings, unique_strings);
}
|
||||
|
||||
|
||||
#define MAX_NAME_TOKENS 50
|
||||
|
||||
|
||||
@@ -229,16 +283,22 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
|
||||
cstring_array *strings = cstring_array_new_size(len);
|
||||
token_array *token_array = token_array_new();
|
||||
|
||||
uint32_array *stopwords_array = uint32_array_new();
|
||||
|
||||
char_array *combined_words_no_whitespace = char_array_new();
|
||||
|
||||
char_array *acronym_with_stopwords = char_array_new();
|
||||
char_array *acronym_no_stopwords = char_array_new();
|
||||
char_array *sub_acronym_with_stopwords = char_array_new();
|
||||
char_array *sub_acronym_no_stopwords = char_array_new();
|
||||
|
||||
khash_t(str_set) *unique_strings = kh_init(str_set);
|
||||
khiter_t k;
|
||||
int ret = 0;
|
||||
bool keep_whitespace = false;
|
||||
|
||||
for (size_t i = 0; i < num_expansions; i++) {
|
||||
char *expansion = cstring_array_get_string(name_expansions, i);
|
||||
log_debug("expansion = %s\n", expansion);
|
||||
bool keep_whitespace = false;
|
||||
token_array_clear(token_array);
|
||||
tokenize_add_tokens(token_array, expansion, strlen(expansion), keep_whitespace);
|
||||
size_t num_tokens = token_array->n;
|
||||
token_t *tokens = token_array->a;
|
||||
@@ -270,80 +330,140 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
|
||||
|
||||
log_debug("token_str = %s\n", token_str);
|
||||
|
||||
double_metaphone_codes_t *dm_codes = double_metaphone(token_str);
|
||||
if (dm_codes == NULL) {
|
||||
prev_token = token;
|
||||
continue;
|
||||
}
|
||||
char *dm_primary = dm_codes->primary;
|
||||
char *dm_secondary = dm_codes->secondary;
|
||||
|
||||
if (!string_equals(dm_primary, "")) {
|
||||
|
||||
k = kh_get(str_set, unique_strings, dm_primary);
|
||||
|
||||
if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) {
|
||||
log_debug("adding dm_primary = %s\n", dm_primary);
|
||||
cstring_array_add_string(strings, dm_primary);
|
||||
k = kh_put(str_set, unique_strings, strdup(dm_primary), &ret);
|
||||
if (ret < 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!string_equals(dm_secondary, dm_primary)) {
|
||||
|
||||
k = kh_get(str_set, unique_strings, dm_secondary);
|
||||
|
||||
if (k == kh_end(unique_strings) && kh_size(unique_strings) <= MAX_NAME_TOKENS) {
|
||||
log_debug("adding dm_secondary = %s\n", dm_secondary);
|
||||
cstring_array_add_string(strings, dm_secondary);
|
||||
k = kh_put(str_set, unique_strings, strdup(dm_secondary), &ret);
|
||||
if (ret < 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
double_metaphone_codes_destroy(dm_codes);
|
||||
add_double_metaphone_to_array_if_unique(token_str, strings, unique_strings);
|
||||
// For non-Latin words (Arabic, Cyrillic, etc.) just add the word
|
||||
// For ideograms, we do two-character shingles, so only add the first character if the string has one token
|
||||
} else if (!ideogram || j > 0 || num_tokens == 1) {
|
||||
char_array_cat_len(token_string_array, expansion + token.offset, token.len);
|
||||
token_str = char_array_get_string(token_string_array);
|
||||
log_debug("token_str = %s\n", token_str);
|
||||
k = kh_get(str_set, unique_strings, token_str);
|
||||
|
||||
if (k == kh_end(unique_strings)) {
|
||||
cstring_array_add_string(strings, token_str);
|
||||
k = kh_put(str_set, unique_strings, strdup(token_str), &ret);
|
||||
if (ret < 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
add_string_to_array_if_unique(token_str, strings, unique_strings);
|
||||
}
|
||||
|
||||
prev_token = token;
|
||||
}
|
||||
|
||||
char *combined = char_array_get_string(combined_words_no_whitespace);
|
||||
log_debug("combined = %s\n", combined);
|
||||
k = kh_get(str_set, unique_strings, combined);
|
||||
if (combined_words_no_whitespace->n > 0) {
|
||||
char *combined = char_array_get_string(combined_words_no_whitespace);
|
||||
add_string_to_array_if_unique(combined, strings, unique_strings);
|
||||
}
|
||||
|
||||
if (k == kh_end(unique_strings)) {
|
||||
cstring_array_add_string(strings, combined);
|
||||
k = kh_put(str_set, unique_strings, strdup(combined), &ret);
|
||||
if (ret < 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
token_array_clear(token_array);
|
||||
char *normalized = libpostal_normalize_string(name, LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS);
|
||||
char *acronym = NULL;
|
||||
if (normalized != NULL) {
|
||||
keep_whitespace = false;
|
||||
tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace);
|
||||
stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages);
|
||||
uint32_t *stopwords = stopwords_array->a;
|
||||
|
||||
size_t num_tokens = token_array->n;
|
||||
token_t *tokens = token_array->a;
|
||||
num_tokens = token_array->n;
|
||||
|
||||
if (num_tokens > 1) {
|
||||
size_t num_stopwords_encountered = 0;
|
||||
bool last_was_stopword = false;
|
||||
bool last_was_punctuation = false;
|
||||
|
||||
for (size_t j = 0; j < num_tokens; j++) {
|
||||
token_t token = tokens[j];
|
||||
// Make sure it's a non-ideographic word token
|
||||
if (is_word_token(token.type) && !is_ideographic(token.type)) {
|
||||
uint8_t *ptr = (uint8_t *)normalized;
|
||||
int32_t ch = 0;
|
||||
ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch);
|
||||
if (ch_len > 0 && utf8_is_letter(utf8proc_category(ch))) {
|
||||
bool is_stopword = stopwords[j] == 1;
|
||||
|
||||
if (!is_stopword && !last_was_punctuation) {
|
||||
char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
|
||||
char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
|
||||
|
||||
if (!(last_was_stopword && j == num_tokens - 1)) {
|
||||
char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
|
||||
char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len);
|
||||
}
|
||||
last_was_stopword = false;
|
||||
} else {
|
||||
if (!last_was_stopword && is_stopword) {
|
||||
num_stopwords_encountered++;
|
||||
}
|
||||
|
||||
char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
|
||||
if (!is_stopword) {
|
||||
char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
|
||||
}
|
||||
|
||||
if ((num_stopwords_encountered % 2 == 0 || last_was_punctuation) && acronym_no_stopwords->n > 1) {
|
||||
acronym = char_array_get_string(sub_acronym_with_stopwords);
|
||||
log_debug("sub acronym stopwords = %s\n", acronym);
|
||||
|
||||
char_array_clear(sub_acronym_with_stopwords);
|
||||
|
||||
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
|
||||
|
||||
acronym = char_array_get_string(sub_acronym_no_stopwords);
|
||||
log_debug("sub acronym no stopwords = %s\n", acronym);
|
||||
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
|
||||
char_array_clear(sub_acronym_no_stopwords);
|
||||
} else if (!((last_was_stopword || last_was_punctuation) && j == num_tokens - 1)) {
|
||||
char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
|
||||
}
|
||||
|
||||
last_was_stopword = is_stopword;
|
||||
}
|
||||
last_was_punctuation = false;
|
||||
}
|
||||
} else if (is_punctuation(token.type)) {
|
||||
log_debug("punctuation\n");
|
||||
last_was_punctuation = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
token_array_clear(token_array);
|
||||
free(normalized);
|
||||
}
|
||||
|
||||
if (acronym_no_stopwords->n > 0) {
|
||||
acronym = char_array_get_string(acronym_with_stopwords);
|
||||
log_debug("acronym with stopwords = %s\n", acronym);
|
||||
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
|
||||
}
|
||||
|
||||
if (acronym_with_stopwords->n > 0) {
|
||||
acronym = char_array_get_string(acronym_no_stopwords);
|
||||
log_debug("acronym no stopwords = %s\n", acronym);
|
||||
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
|
||||
|
||||
}
|
||||
|
||||
if (sub_acronym_no_stopwords->n > 0) {
|
||||
acronym = char_array_get_string(sub_acronym_with_stopwords);
|
||||
log_debug("final sub acronym stopwords = %s\n", acronym);
|
||||
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
|
||||
}
|
||||
|
||||
if (sub_acronym_with_stopwords->n > 0) {
|
||||
acronym = char_array_get_string(sub_acronym_no_stopwords);
|
||||
log_debug("final sub acronym no stopwords = %s\n", acronym);
|
||||
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
|
||||
}
|
||||
|
||||
|
||||
|
||||
char_array_destroy(token_string_array);
|
||||
token_array_destroy(token_array);
|
||||
char_array_destroy(combined_words_no_whitespace);
|
||||
char_array_destroy(acronym_with_stopwords);
|
||||
char_array_destroy(acronym_no_stopwords);
|
||||
char_array_destroy(sub_acronym_with_stopwords);
|
||||
char_array_destroy(sub_acronym_no_stopwords);
|
||||
|
||||
uint32_array_destroy(stopwords_array);
|
||||
|
||||
cstring_array_destroy(name_expansions);
|
||||
|
||||
@@ -375,7 +495,7 @@ static inline void add_string_arrays_to_tree(string_tree_t *tree, size_t n, va_l
|
||||
static inline void add_hashes_from_tree(cstring_array *near_dupe_hashes, char *prefix, string_tree_t *tree) {
|
||||
string_tree_iterator_t *iter = string_tree_iterator_new(tree);
|
||||
if (iter->num_tokens > 0) {
|
||||
log_debug("iter->num_tokens = %zu\n", iter->num_tokens);
|
||||
log_debug("iter->num_tokens = %u\n", iter->num_tokens);
|
||||
|
||||
for (; !string_tree_iterator_done(iter); string_tree_iterator_next(iter)) {
|
||||
|
||||
@@ -407,7 +527,7 @@ static inline void add_string_hash_permutations(cstring_array *near_dupe_hashes,
|
||||
add_string_arrays_to_tree(tree, n, args);
|
||||
va_end(args);
|
||||
|
||||
log_debug("string_tree_num_strings(tree)=%zu\n", string_tree_num_strings(tree));
|
||||
log_debug("string_tree_num_strings(tree)=%u\n", string_tree_num_strings(tree));
|
||||
|
||||
add_hashes_from_tree(near_dupe_hashes, prefix, tree);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user