[dedupe] Add a near-dupe hash that takes into account acronyms already present in the string. For known acronyms defined in the dictionaries, like "HS", the full token is included in the generated acronym. This is particularly useful for public schools and other cases where several forms of the canonical string may be in use, e.g. "Foo High School", "Foo HS", and "FHS". The same treatment applies to acronyms the tokenizer identifies from internal period structure, like "A.B.C." Mixed alphanumeric tokens are now also eligible for the double metaphone encoding, and numeric tokens with script=Common (digits, possibly containing hyphens, etc.) are included as full tokens among the hash words rather than as quadgrams, which don't make sense for numerics.
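As a minimal sketch of the intended alignment (the helper name toy_acronym, the whitespace-only tokenization, and the all-uppercase heuristic for spotting existing acronyms are hypothetical simplifications; the real implementation uses the dictionaries and the tokenizer):

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Toy model of the acronym alignment: tokens that already look like
 * acronyms contribute their full text, all other tokens contribute
 * their first letter, so the three forms of the name below collide
 * on the same key. */
static void toy_acronym(const char *name, char *out, size_t out_size) {
    size_t n = 0;
    const char *p = name;
    while (*p != '\0' && n + 4 < out_size) {
        while (*p == ' ') p++;            /* skip inter-token spaces */
        if (*p == '\0') break;
        size_t len = strcspn(p, " ");     /* naive whitespace token */
        if (len <= 3 && strspn(p, "ABCDEFGHIJKLMNOPQRSTUVWXYZ") >= len) {
            memcpy(out + n, p, len);      /* existing acronym: keep the whole token */
            n += len;
        } else {
            out[n++] = (char)toupper((unsigned char)*p);  /* first letter only */
        }
        p += len;
    }
    out[n] = '\0';
}

int main(void) {
    char buf[32];
    const char *names[] = {"Foo High School", "Foo HS", "FHS"};
    for (size_t i = 0; i < 3; i++) {
        toy_acronym(names[i], buf, sizeof(buf));
        printf("%-16s -> %s\n", names[i], buf);  /* all three print FHS */
    }
    return 0;
}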
@@ -9,6 +9,7 @@
 #include "expand.h"
 #include "features.h"
 #include "float_utils.h"
+#include "normalize.h"
 #include "ngrams.h"
 #include "place.h"
 #include "scanner.h"
@@ -183,7 +184,6 @@ static inline cstring_array *expanded_component_root_with_fallback(char *input,
     }
 }
 
-
 static cstring_array *geohash_and_neighbors(double latitude, double longitude, size_t geohash_precision) {
     if (geohash_precision == 0) return NULL;
 
@@ -301,6 +301,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     token_array *token_array = token_array_new();
 
     uint32_array *stopwords_array = uint32_array_new();
+    uint32_array *existing_acronyms_array = uint32_array_new();
 
     char_array *combined_words_no_whitespace = char_array_new();
 
@@ -332,6 +333,9 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
             string_script_t token_script = get_string_script(expansion + token.offset, token.len);
             bool is_latin = token_script.len == token.len && token_script.script == SCRIPT_LATIN;
 
+            bool is_common_script = token_script.len == token.len && token_script.script == SCRIPT_COMMON;
+            bool is_numeric = is_numeric_token(token.type);
+
             char_array_clear(token_string_array);
             // For ideograms, since the "words" are characters, we use shingles of two characters
             if (ideogram && j > 0 && is_ideographic(prev_token.type)) {
@@ -342,7 +346,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
             char_array_cat_len(combined_words_no_whitespace, expansion + token.offset, token.len);
 
             // For Latin script, add double metaphone of the words
-            if (is_latin && !is_numeric_token(token.type) && !ideogram && !is_punctuation(token.type)) {
+            if (is_latin && !(is_numeric && is_common_script) && !ideogram && !is_punctuation(token.type)) {
                 char_array_clear(token_string_array);
                 char_array_cat_len(token_string_array, expansion + token.offset, token.len);
                 token_str = char_array_get_string(token_string_array);
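The loosened guard above admits mixed alphanumeric tokens into the double metaphone branch: a numeric-typed token is now excluded only when it is also an entire script=Common run. A minimal sketch of the predicate, with the flag derivation collapsed into plain booleans (an assumption; the real flags come from the token type and the script run computed earlier in the loop):

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the revised guard; flag names mirror the diff. A token the
 * tokenizer types as numeric but whose characters scan as a Latin run
 * (e.g. an alphanumeric like "42B", assuming the scanner assigns it a
 * Latin run covering the whole token) now reaches the double metaphone
 * branch, while a purely numeric Common-script "42" is still excluded. */
static bool wants_double_metaphone(bool is_latin, bool is_numeric,
                                   bool is_common_script, bool ideogram,
                                   bool is_punct) {
    return is_latin && !(is_numeric && is_common_script) && !ideogram && !is_punct;
}

int main(void) {
    /* mixed alphanumeric: numeric-typed, Latin script => encoded */
    printf("%d\n", wants_double_metaphone(true, true, false, false, false));  /* 1 */
    /* pure number: numeric-typed, Common script => skipped */
    printf("%d\n", wants_double_metaphone(false, true, true, false, false));  /* 0 */
    return 0;
}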
@@ -357,7 +361,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
             token_str = char_array_get_string(token_string_array);
             log_debug("token_str = %s\n", token_str);
 
-            if (!ideogram) {
+            if (!ideogram && !(is_numeric || is_common_script)) {
                 add_quadgrams_or_string_to_array_if_unique(token_str, strings, unique_strings, ngrams);
             } else {
                 add_string_to_array_if_unique(token_str, strings, unique_strings);
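The same pair of flags now routes numeric and Common-script tokens past the quadgram expansion, so the full token becomes a single hash word; the quadgrams of one number ("1000" and "0001" for "10001") are shared with countless unrelated numbers. A hypothetical sketch of the dispatch (add_word stands in for the add_*_to_array_if_unique helpers, and the len > 4 threshold is an assumption):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Alphabetic tokens are broken into sliding 4-grams for fuzzy matching;
 * ideograms, numerics, and Common-script tokens are kept whole. */
static void add_hash_words(const char *token, bool is_numeric,
                           bool is_common_script, bool ideogram,
                           void (*add_word)(const char *, size_t)) {
    size_t len = strlen(token);
    if (!ideogram && !(is_numeric || is_common_script) && len > 4) {
        for (size_t i = 0; i + 4 <= len; i++) {
            add_word(token + i, 4);   /* sliding quadgrams */
        }
    } else {
        add_word(token, len);         /* the full token as one word */
    }
}

static void print_word(const char *s, size_t n) {
    printf("%.*s\n", (int)n, s);
}

int main(void) {
    add_hash_words("school", false, false, false, print_word); /* scho, choo, hool */
    add_hash_words("10001", true, true, false, print_word);    /* 10001 */
    return 0;
}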
@@ -381,7 +385,10 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     keep_whitespace = false;
     tokenize_add_tokens(token_array, normalized, strlen(normalized), keep_whitespace);
     stopword_positions(stopwords_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages);
+    existing_acronym_phrase_positions(existing_acronyms_array, (const char *)normalized, token_array, normalize_options.num_languages, normalize_options.languages);
 
     uint32_t *stopwords = stopwords_array->a;
+    uint32_t *existing_acronyms = existing_acronyms_array->a;
+
     size_t num_tokens = token_array->n;
     token_t *tokens = token_array->a;
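The contract assumed here for existing_acronym_phrase_positions, by analogy with stopword_positions on the line above it: the output array is parallel to the token array, with a nonzero entry wherever a token participates in a dictionary acronym phrase. A toy illustration of that assumed layout:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical illustration only: "hs" is flagged as a known acronym
 * phrase for the tokens of "foo hs". */
int main(void) {
    const char *tokens[] = {"foo", "hs"};
    uint32_t existing_acronyms[] = {0, 1};
    for (size_t j = 0; j < 2; j++) {
        if (existing_acronyms[j] > 0) {
            printf("token %zu (%s) is an existing acronym\n", j, tokens[j]);
        }
    }
    return 0;
}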
@@ -392,16 +399,40 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     bool last_was_stopword = false;
     bool last_was_punctuation = false;
 
+    bool any_existing_acronyms = false;
+    char_array *temp_norm_token = NULL;
+    for (size_t j = 0; j < num_tokens; j++) {
+        if (existing_acronyms[j] > 0) {
+            any_existing_acronyms = true;
+            temp_norm_token = char_array_new();
+            break;
+        }
+    }
+
     for (size_t j = 0; j < num_tokens; j++) {
         token_t token = tokens[j];
         // Make sure it's a non-ideographic word token
-        if (is_word_token(token.type) && !is_ideographic(token.type)) {
+        if (!is_ideographic(token.type)) {
             uint8_t *ptr = (uint8_t *)normalized;
             int32_t ch = 0;
             ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch);
             if (ch_len > 0 && utf8_is_letter(utf8proc_category(ch))) {
                 bool is_stopword = stopwords[j] == 1;
 
+                if (existing_acronyms[j] > 0 && !is_stopword) {
+                    uint64_t token_options = NORMALIZE_TOKEN_DELETE_FINAL_PERIOD | NORMALIZE_TOKEN_DELETE_ACRONYM_PERIODS;
+                    add_normalized_token(temp_norm_token, normalized, token, token_options);
+                    char *norm_acronym_token = char_array_get_string(temp_norm_token);
+
+                    char_array_cat(acronym_with_stopwords, norm_acronym_token);
+                    char_array_cat(acronym_no_stopwords, norm_acronym_token);
+                    char_array_cat(sub_acronym_with_stopwords, norm_acronym_token);
+                    char_array_cat(sub_acronym_no_stopwords, norm_acronym_token);
+                    last_was_stopword = false;
+                    last_was_punctuation = false;
+                    continue;
+                }
+
                 if (!is_stopword && !last_was_punctuation) {
                     char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
                     char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
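Based on their names, the two normalization flags combined above are assumed to strip internal acronym periods and a trailing period before the token is appended to each acronym variant, so "H.S." and "A.B.C." contribute "HS" and "ABC". A toy stand-in for that step (the real add_normalized_token operates on token_t offsets into the normalized string):

#include <stdio.h>

/* Hypothetical sketch: remove every period from an acronym token. */
static void strip_acronym_periods(const char *token, char *out) {
    size_t n = 0;
    for (const char *p = token; *p != '\0'; p++) {
        if (*p != '.') out[n++] = *p;
    }
    out[n] = '\0';
}

int main(void) {
    char buf[16];
    strip_acronym_periods("A.B.C.", buf);
    printf("%s\n", buf);  /* prints ABC */
    return 0;
}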
@@ -456,6 +487,10 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
                 last_was_punctuation = true;
             }
         }
+
+        if (temp_norm_token != NULL) {
+            char_array_destroy(temp_norm_token);
+        }
     }
 
     free(normalized);
@@ -486,7 +521,6 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     }
 
-
 
     char_array_destroy(token_string_array);
     token_array_destroy(token_array);
    char_array_destroy(combined_words_no_whitespace);
@@ -498,6 +532,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
     cstring_array_destroy(ngrams);
 
     uint32_array_destroy(stopwords_array);
+    uint32_array_destroy(existing_acronyms_array);
 
     cstring_array_destroy(name_expansions);
 