From 7121642c6229d90bd2f8c3f98ec15be0039d20e5 Mon Sep 17 00:00:00 2001 From: Al Date: Wed, 17 Jan 2018 18:56:21 -0500 Subject: [PATCH] [dedupe] fixing sub-acronym near-dupe hashes with punctuation, and making sure to add the current token after a new sub-acronym has been cut --- src/near_dupe.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/near_dupe.c b/src/near_dupe.c index 7f54cc1e..1ca39184 100644 --- a/src/near_dupe.c +++ b/src/near_dupe.c @@ -411,8 +411,9 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal for (size_t j = 0; j < num_tokens; j++) { token_t token = tokens[j]; + bool is_punct = is_punctuation(token.type); // Make sure it's a non-ideographic word token - if (!is_ideographic(token.type)) { + if (!is_ideographic(token.type) && !is_punct) { uint8_t *ptr = (uint8_t *)normalized; int32_t ch = 0; ssize_t ch_len = utf8proc_iterate(ptr + token.offset, token.len, &ch); @@ -457,14 +458,17 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal acronym = char_array_get_string(sub_acronym_with_stopwords); log_debug("sub acronym stopwords = %s\n", acronym); - char_array_clear(sub_acronym_with_stopwords); - add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings, ngrams); + char_array_clear(sub_acronym_with_stopwords); + char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len); acronym = char_array_get_string(sub_acronym_no_stopwords); log_debug("sub acronym no stopwords = %s\n", acronym); add_double_metaphone_or_token_if_unique(acronym, strings, unique_strings, ngrams); char_array_clear(sub_acronym_no_stopwords); + if (!is_stopword) { + char_array_cat_len(sub_acronym_no_stopwords, normalized + token.offset, ch_len); + } } else if (!(last_was_stopword || last_was_punctuation) && j == num_tokens - 1) { char_array_cat_len(sub_acronym_with_stopwords, normalized + token.offset, ch_len); if (!is_stopword) { @@ -482,7 +486,7 @@ cstring_array *name_word_hashes(char *name, libpostal_normalize_options_t normal } last_was_punctuation = false; } - } else if (is_punctuation(token.type)) { + } else if (is_punct) { log_debug("punctuation\n"); last_was_punctuation = true; }