From 03e5e25240a1f6f023d77c7c251eb437a4e8a967 Mon Sep 17 00:00:00 2001
From: Al <albarrentine@gmail.com>
Date: Tue, 16 Jan 2018 03:07:32 -0500
Subject: [PATCH] [dedupe] adding a near-dupe hash which takes into account
 existing acronyms which may have appeared in the string, such as known
 acronyms defined in the dictionaries like "HS", and includes the full token
 in the acronym. This feature is particularly useful for public schools or
 other cases where the canonical string may be used, e.g. "Foo High School",
 "Foo HS" and "FHS". It also does the same thing for other acronyms that are
 identified by the tokenizer from the internal period structure like A.B.C.
 Also now allowing mixed alpha-numeric tokens to use the double metaphone
 encoding as well, and for numeric tokens with script=Common (digits but may
 also contain hyphens, etc.), the full token is included as one of the words
 rather than quadgrams, which don't make sense for numerics.

---
 src/near_dupe.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/near_dupe.c b/src/near_dupe.c
index 081e6aa6..7f54cc1e 100644
--- a/src/near_dupe.c
+++ b/src/near_dupe.c
@@ -9,6 +9,7 @@
 #include "expand.h"
 #include "features.h"
 #include "float_utils.h"
+#include "normalize.h"
 #include "ngrams.h"
 #include "place.h"
 #include "scanner.h"
@@ -183,7 +184,6 @@ static inline cstring_array *expanded_component_root_with_fallback(char *input,
     }
 }
 
-
 static cstring_array *geohash_and_neighbors(double latitude, double longitude, size_t geohash_precision) {
     if (geohash_precision == 0) return NULL;
 
@@ -301,6 +301,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     token_array *token_array = token_array_new();
 
     uint32_array *stopwords_array = uint32_array_new();
+    uint32_array *existing_acronyms_array = uint32_array_new();
 
     char_array *combined_words_no_whitespace = char_array_new();
 
@@ -332,6 +333,9 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
             string_script_t token_script = get_string_script(expansion + token.offset, token.len);
             bool is_latin = token_script.len == token.len && token_script.script == SCRIPT_LATIN;
 
+            bool is_common_script = token_script.len == token.len && token_script.script == SCRIPT_COMMON;
+            bool is_numeric = is_numeric_token(token.type);
+
             char_array_clear(token_string_array);
             // For ideograms, since the "words" are characters, we use shingles of two characters
             if (ideogram && j > 0 && is_ideographic(prev_token.type)) {
@@ -342,7 +346,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
             char_array_cat_len(combined_words_no_whitespace, expansion + token.offset, token.len);
 
             // For Latin script, add double metaphone of the words
-            if (is_latin && !is_numeric_token(token.type) && !ideogram && !is_punctuation(token.type)) {
+            if (is_latin && !(is_numeric && is_common_script) && !ideogram && !is_punctuation(token.type)) {
                 char_array_clear(token_string_array);
                 char_array_cat_len(token_string_array, expansion + token.offset, token.len);
                 token_str = char_array_get_string(token_string_array);
@@ -357,7 +361,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
                 token_str = char_array_get_string(token_string_array);
                 log_debug("token_str = %s\n", token_str);
 
-                if (!ideogram) {
+                if (!ideogram && !(is_numeric || is_common_script)) {
                     add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams);
                 } else {
                     add_string_to_array_if_unique(token_str, strings, unique_strings);
@@ -381,7 +385,10 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
         keep_whitespace = false;
         tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace);
         stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages);
+        existing_acronym_phrase_positions(existing_acronyms_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages);
+
         uint32_t *stopwords = stopwords_array->a;
+        uint32_t *existing_acronyms = existing_acronyms_array->a;
 
         size_t num_tokens = token_array->n;
         token_t *tokens = token_array->a;
@@ -392,16 +399,40 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
             bool last_was_stopword = false;
             bool last_was_punctuation = false;
 
+            bool any_existing_acronyms = false;
+            char_array *temp_norm_token = NULL;
+            for (size_t j = 0; j < num_tokens; j++) {
+                if (existing_acronyms[j] > 0) {
+                    any_existing_acronyms = true;
+                    temp_norm_token = char_array_new();
+                    break;
+                }
+            }
+
             for (size_t j = 0; j < num_tokens; j++) {
                 token_t token = tokens[j];
                 // Make sure it's a non-ideographic word token
-                if (is_word_token(token.type) && !is_ideographic(token.type)) {
+                if (!is_ideographic(token.type)) {
                     uint8_t *ptr = (uint8_t *)normalized;
                     int32_t ch = 0;
                     ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch);
                     if (ch_len > 0 && utf8_is_letter(utf8proc_category(ch))) {
                         bool is_stopword = stopwords[j] == 1;
 
+                        if (existing_acronyms[j] > 0 && !is_stopword) {
+                            uint64_t token_options = NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS;
+                            add_normalized_token(temp_norm_token, normalized, token, token_options);
+                            char *norm_acronym_token = char_array_get_string(temp_norm_token);
+
+                            char_array_cat(acronym_with_stopwords, norm_acronym_token);
+                            char_array_cat(acronym_no_stopwords, norm_acronym_token);
+                            char_array_cat(sub_acronym_with_stopwords, norm_acronym_token);
+                            char_array_cat(sub_acronym_no_stopwords, norm_acronym_token);
+                            last_was_stopword = false;
+                            last_was_punctuation = false;
+                            continue;
+                        }
+
                         if (!is_stopword && !last_was_punctuation) {
                             char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
                             char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
@@ -456,6 +487,10 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
                     last_was_punctuation = true;
                 }
             }
+
+            if (temp_norm_token != NULL) {
+                char_array_destroy(temp_norm_token);
+            }
         }
 
         free(normalized);
@@ -486,7 +521,6 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     }
 
 
-
     char_array_destroy(token_string_array);
     token_array_destroy(token_array);
     char_array_destroy(combined_words_no_whitespace);
@@ -498,6 +532,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     cstring_array_destroy(ngrams);
 
     uint32_array_destroy(stopwords_array);
+    uint32_array_destroy(existing_acronyms_array);
 
     cstring_array_destroy(name_expansions);