[fix] logic in sub-acronym generation for near-duplicate hashes

This commit is contained in:
Al
2018-01-11 13:15:19 -05:00
parent 6ba0403748
commit f5e41a1f57

View File

@@ -383,7 +383,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
char_array_cat_len(acronym_with_stopwords, normalized + token.offset, ch_len);
char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
if (!(last_was_stopword && j == num_tokens - 1)) {
if (!(last_was_stopword && sub_acronym_no_stopwords->n == 0 && j == num_tokens - 1)) {
char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len);
}
@@ -398,7 +398,8 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
char_array_cat_len(acronym_no_stopwords, normalized + token.offset, ch_len);
}
if ((num_stopwords_encountered % 2 == 0 || last_was_punctuation) && acronym_no_stopwords->n > 1) {
if ((num_stopwords_encountered % 2 == 0 || last_was_punctuation)) {
if (sub_acronym_no_stopwords->n > 1) {
acronym = char_array_get_string(sub_acronym_with_stopwords);
log_debug("sub acronym stopwords = %s\n", acronym);
@@ -410,8 +411,17 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
log_debug("sub acronym no stopwords = %s\n", acronym);
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
char_array_clear(sub_acronym_no_stopwords);
} else if (!((last_was_stopword || last_was_punctuation) && j == num_tokens - 1)) {
} else if (!(last_was_stopword || last_was_punctuation) && j == num_tokens - 1) {
char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
if (!is_stopword) {
char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len);
}
}
} else {
char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len);
if (!is_stopword) {
char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len);
}
}
last_was_stopword = is_stopword;
@@ -438,7 +448,6 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal
acronym = char_array_get_string(acronym_no_stopwords);
log_debug("acronym no stopwords = %s\n", acronym);
add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings);
}
if (sub_acronym_no_stopwords->n > 0) {