[dedupe] Account for missing ordinal suffixes in Soft-TFIDF deduping, i.e. count "1st Place" and "1 Plce" as the same in cases where there might be a misspelling and the phrase would not match under exact expansions.
This commit is contained in:
21
src/dedupe.c
21
src/dedupe.c
@@ -365,6 +365,9 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
||||
|
||||
bool is_ideographic = have_ideographic_word_tokens(token_array1) && have_ideographic_word_tokens(token_array2);
|
||||
|
||||
uint32_array *ordinal_suffixes1 = uint32_array_new_size(num_tokens1);
|
||||
uint32_array *ordinal_suffixes2 = uint32_array_new_size(num_tokens2);
|
||||
|
||||
size_t min_len = num_tokens1 < num_tokens2 ? num_tokens1 : num_tokens2;
|
||||
size_t num_matches = 0;
|
||||
|
||||
@@ -386,9 +389,15 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
||||
search_address_dictionaries_tokens_with_phrases(joined1, token_array1, lang, &phrases1);
|
||||
search_address_dictionaries_tokens_with_phrases(joined2, token_array2, lang, &phrases2);
|
||||
|
||||
uint32_array_clear(ordinal_suffixes1);
|
||||
uint32_array_clear(ordinal_suffixes2);
|
||||
|
||||
add_ordinal_suffix_lengths(ordinal_suffixes1, joined1, token_array1, lang);
|
||||
add_ordinal_suffix_lengths(ordinal_suffixes2, joined2, token_array2, lang);
|
||||
|
||||
size_t matches_i = 0;
|
||||
|
||||
double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, multi_word_alignments, soft_tfidf_options, &matches_i);
|
||||
double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, ordinal_suffixes1, num_tokens2, tokens2, token_scores2, phrases2, ordinal_suffixes2, acronym_alignments, multi_word_alignments, soft_tfidf_options, &matches_i);
|
||||
if (sim > max_sim) {
|
||||
max_sim = sim;
|
||||
}
|
||||
@@ -398,7 +407,7 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
||||
}
|
||||
}
|
||||
} else if (do_acronyms || multi_word_alignments != NULL) {
|
||||
max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, multi_word_alignments, soft_tfidf_options, &num_matches);
|
||||
max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, NULL, num_tokens2, tokens2, token_scores2, phrases2, NULL, acronym_alignments, multi_word_alignments, soft_tfidf_options, &num_matches);
|
||||
} else {
|
||||
max_sim = soft_tfidf_similarity(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options, &num_matches);
|
||||
}
|
||||
@@ -439,6 +448,14 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
||||
phrase_array_destroy(phrases2);
|
||||
}
|
||||
|
||||
if (ordinal_suffixes1 != NULL) {
|
||||
uint32_array_destroy(ordinal_suffixes1);
|
||||
}
|
||||
|
||||
if (ordinal_suffixes2 != NULL) {
|
||||
uint32_array_destroy(ordinal_suffixes2);
|
||||
}
|
||||
|
||||
if (acronym_alignments != NULL) {
|
||||
phrase_array_destroy(acronym_alignments);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user