[dedupe] adding multi-word phrase alignments to deduping
This commit is contained in:
15
src/dedupe.c
15
src/dedupe.c
@@ -7,6 +7,7 @@
|
|||||||
#include "place.h"
|
#include "place.h"
|
||||||
#include "scanner.h"
|
#include "scanner.h"
|
||||||
#include "soft_tfidf.h"
|
#include "soft_tfidf.h"
|
||||||
|
#include "string_similarity.h"
|
||||||
#include "token_types.h"
|
#include "token_types.h"
|
||||||
|
|
||||||
bool expansions_intersect(cstring_array *expansions1, cstring_array *expansions2) {
|
bool expansions_intersect(cstring_array *expansions1, cstring_array *expansions2) {
|
||||||
@@ -357,7 +358,8 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
|||||||
char **languages = options.languages;
|
char **languages = options.languages;
|
||||||
|
|
||||||
phrase_array *acronym_alignments = NULL;
|
phrase_array *acronym_alignments = NULL;
|
||||||
|
phrase_array *multi_word_alignments = NULL;
|
||||||
|
|
||||||
phrase_array *phrases1 = NULL;
|
phrase_array *phrases1 = NULL;
|
||||||
phrase_array *phrases2 = NULL;
|
phrase_array *phrases2 = NULL;
|
||||||
|
|
||||||
@@ -370,6 +372,7 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
|||||||
if (do_acronyms) {
|
if (do_acronyms) {
|
||||||
acronym_alignments = acronym_token_alignments(joined1, token_array1, joined2, token_array2, num_languages, languages);
|
acronym_alignments = acronym_token_alignments(joined1, token_array1, joined2, token_array2, num_languages, languages);
|
||||||
}
|
}
|
||||||
|
multi_word_alignments = multi_word_token_alignments(joined1, token_array1, joined2, token_array2);
|
||||||
|
|
||||||
if (num_languages > 0) {
|
if (num_languages > 0) {
|
||||||
phrases1 = phrase_array_new();
|
phrases1 = phrase_array_new();
|
||||||
@@ -385,7 +388,7 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
|||||||
|
|
||||||
size_t matches_i = 0;
|
size_t matches_i = 0;
|
||||||
|
|
||||||
double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options, &matches_i);
|
double sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, multi_word_alignments, soft_tfidf_options, &matches_i);
|
||||||
if (sim > max_sim) {
|
if (sim > max_sim) {
|
||||||
max_sim = sim;
|
max_sim = sim;
|
||||||
}
|
}
|
||||||
@@ -394,8 +397,8 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
|||||||
num_matches = matches_i;
|
num_matches = matches_i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (do_acronyms) {
|
} else if (do_acronyms || multi_word_alignments != NULL) {
|
||||||
max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, soft_tfidf_options, &num_matches);
|
max_sim = soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, phrases1, num_tokens2, tokens2, token_scores2, phrases2, acronym_alignments, multi_word_alignments, soft_tfidf_options, &num_matches);
|
||||||
} else {
|
} else {
|
||||||
max_sim = soft_tfidf_similarity(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options, &num_matches);
|
max_sim = soft_tfidf_similarity(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, soft_tfidf_options, &num_matches);
|
||||||
}
|
}
|
||||||
@@ -440,6 +443,10 @@ libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char *
|
|||||||
phrase_array_destroy(acronym_alignments);
|
phrase_array_destroy(acronym_alignments);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (multi_word_alignments != NULL) {
|
||||||
|
phrase_array_destroy(multi_word_alignments);
|
||||||
|
}
|
||||||
|
|
||||||
if (token_array1 != NULL) {
|
if (token_array1 != NULL) {
|
||||||
token_array_destroy(token_array1);
|
token_array_destroy(token_array1);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ static inline size_t sum_token_lengths(size_t num_tokens, char **tokens) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options, size_t *num_matches) {
|
double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, phrase_array *multi_word_alignments, soft_tfidf_options_t options, size_t *num_matches) {
|
||||||
if (token_scores1 == NULL || token_scores2 == NULL) return 0.0;
|
if (token_scores1 == NULL || token_scores2 == NULL) return 0.0;
|
||||||
|
|
||||||
if (num_tokens1 > num_tokens2 || (num_tokens1 == num_tokens2 && sum_token_lengths(num_tokens1, tokens1) > sum_token_lengths(num_tokens2, tokens2))) {
|
if (num_tokens1 > num_tokens2 || (num_tokens1 == num_tokens2 && sum_token_lengths(num_tokens1, tokens1) > sum_token_lengths(num_tokens2, tokens2))) {
|
||||||
@@ -164,6 +164,9 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
int64_array *acronym_memberships_array = NULL;
|
int64_array *acronym_memberships_array = NULL;
|
||||||
int64_t *acronym_memberships = NULL;
|
int64_t *acronym_memberships = NULL;
|
||||||
|
|
||||||
|
int64_array *multi_word_memberships_array = NULL;
|
||||||
|
int64_t *multi_word_memberships = NULL;
|
||||||
|
|
||||||
t1_tokens_unicode = calloc(len1, sizeof(uint32_array *));
|
t1_tokens_unicode = calloc(len1, sizeof(uint32_array *));
|
||||||
if (t1_tokens_unicode == NULL) {
|
if (t1_tokens_unicode == NULL) {
|
||||||
total_sim = -1.0;
|
total_sim = -1.0;
|
||||||
@@ -217,6 +220,14 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (multi_word_alignments != NULL) {
|
||||||
|
multi_word_memberships_array = int64_array_new();
|
||||||
|
token_phrase_memberships(multi_word_alignments, multi_word_memberships_array, len2);
|
||||||
|
if (multi_word_memberships_array->n == len2) {
|
||||||
|
multi_word_memberships = multi_word_memberships_array->a;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
double jaro_winkler_min = options.jaro_winkler_min;
|
double jaro_winkler_min = options.jaro_winkler_min;
|
||||||
size_t jaro_winkler_min_length = options.jaro_winkler_min_length;
|
size_t jaro_winkler_min_length = options.jaro_winkler_min_length;
|
||||||
size_t damerau_levenshtein_max = options.damerau_levenshtein_max;
|
size_t damerau_levenshtein_max = options.damerau_levenshtein_max;
|
||||||
@@ -254,7 +265,10 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
int64_t pm1 = phrase_memberships1 != NULL ? phrase_memberships1[i] : NULL_PHRASE_MEMBERSHIP;
|
int64_t pm1 = phrase_memberships1 != NULL ? phrase_memberships1[i] : NULL_PHRASE_MEMBERSHIP;
|
||||||
phrase_t p1 = pm1 >= 0 ? phrases1->a[pm1] : NULL_PHRASE;
|
phrase_t p1 = pm1 >= 0 ? phrases1->a[pm1] : NULL_PHRASE;
|
||||||
phrase_t argmax_phrase = NULL_PHRASE;
|
phrase_t argmax_phrase = NULL_PHRASE;
|
||||||
|
|
||||||
|
bool have_multi_word_match = false;
|
||||||
|
phrase_t multi_word_phrase = NULL_PHRASE;
|
||||||
|
|
||||||
bool use_jaro_winkler = t1_len >= jaro_winkler_min_length;
|
bool use_jaro_winkler = t1_len >= jaro_winkler_min_length;
|
||||||
bool use_strict_abbreviation_sim = t1_len >= strict_abbreviation_min_length;
|
bool use_strict_abbreviation_sim = t1_len >= strict_abbreviation_min_length;
|
||||||
bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1_len >= damerau_levenshtein_min_length;
|
bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1_len >= damerau_levenshtein_min_length;
|
||||||
@@ -315,6 +329,23 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (multi_word_memberships != NULL) {
|
||||||
|
int64_t multi_word_membership = multi_word_memberships[j];
|
||||||
|
log_debug("multi_word_membership = %zd\n", multi_word_membership);
|
||||||
|
if (multi_word_membership >= 0) {
|
||||||
|
multi_word_phrase = multi_word_alignments->a[multi_word_membership];
|
||||||
|
uint32_t multi_word_match_index = multi_word_phrase.data;
|
||||||
|
if (multi_word_match_index == i) {
|
||||||
|
max_sim = 1.0;
|
||||||
|
argmax_sim = j;
|
||||||
|
have_multi_word_match = true;
|
||||||
|
log_debug("have multi-word match\n");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u);
|
double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u);
|
||||||
if (jaro_winkler > max_sim) {
|
if (jaro_winkler > max_sim) {
|
||||||
max_sim = jaro_winkler;
|
max_sim = jaro_winkler;
|
||||||
@@ -349,7 +380,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
// Note: here edit distance, affine gap and abbreviations are only used in the thresholding process.
|
// Note: here edit distance, affine gap and abbreviations are only used in the thresholding process.
|
||||||
// Jaro-Winkler is still used to calculate similarity
|
// Jaro-Winkler is still used to calculate similarity
|
||||||
|
|
||||||
if (!have_acronym_match && !have_phrase_match) {
|
if (!have_acronym_match && !have_phrase_match && !have_multi_word_match) {
|
||||||
if (have_equal || (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)))) {
|
if (have_equal || (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)))) {
|
||||||
log_debug("jaro-winkler, max_sim = %f\n", max_sim);
|
log_debug("jaro-winkler, max_sim = %f\n", max_sim);
|
||||||
t2_score = token_scores2[argmax_sim];
|
t2_score = token_scores2[argmax_sim];
|
||||||
@@ -407,7 +438,33 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
|
|
||||||
matched_tokens += p1.len;
|
matched_tokens += p1.len;
|
||||||
log_debug("have_phrase_match\n");
|
log_debug("have_phrase_match\n");
|
||||||
} else {
|
} else if (have_multi_word_match) {
|
||||||
|
double multi_word_score = 0.0;
|
||||||
|
for (size_t p = multi_word_phrase.start; p < multi_word_phrase.start + multi_word_phrase.len; p++) {
|
||||||
|
t2_score = token_scores2[p];
|
||||||
|
multi_word_score += t2_score * t2_score;
|
||||||
|
}
|
||||||
|
|
||||||
|
double norm_multi_word_score = sqrt(multi_word_score);
|
||||||
|
|
||||||
|
double max_multi_word_score = 0.0;
|
||||||
|
if (t1_score > norm_multi_word_score || double_equals(t1_score, norm_multi_word_score)) {
|
||||||
|
norm2_offset += (t1_score * t1_score) - multi_word_score;
|
||||||
|
log_debug("t1_score >= norm_multi_word_score, norm2_offset = %f\n", norm2_offset);
|
||||||
|
max_multi_word_score = t1_score;
|
||||||
|
} else {
|
||||||
|
norm1_offset += multi_word_score - (t1_score * t1_score);
|
||||||
|
log_debug("norm_multi_word_score > t1_score, norm1_offset = %f\n", norm1_offset);
|
||||||
|
max_multi_word_score = norm_multi_word_score;
|
||||||
|
}
|
||||||
|
|
||||||
|
log_debug("max_multi_word_score = %f\n", max_multi_word_score);
|
||||||
|
|
||||||
|
total_sim += max_multi_word_score * max_multi_word_score;
|
||||||
|
|
||||||
|
log_debug("have multi-word match\n");
|
||||||
|
matched_tokens++;
|
||||||
|
} else if (have_acronym_match) {
|
||||||
double acronym_score = 0.0;
|
double acronym_score = 0.0;
|
||||||
for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
|
for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
|
||||||
t2_score = token_scores2[p];
|
t2_score = token_scores2[p];
|
||||||
@@ -475,6 +532,10 @@ return_soft_tfidf_score:
|
|||||||
int64_array_destroy(acronym_memberships_array);
|
int64_array_destroy(acronym_memberships_array);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (multi_word_memberships_array != NULL) {
|
||||||
|
int64_array_destroy(multi_word_memberships_array);
|
||||||
|
}
|
||||||
|
|
||||||
double norm = sqrt(double_array_sum_sq(token_scores1, num_tokens1) + norm1_offset) * sqrt(double_array_sum_sq(token_scores2, num_tokens2) + norm2_offset);
|
double norm = sqrt(double_array_sum_sq(token_scores1, num_tokens1) + norm1_offset) * sqrt(double_array_sum_sq(token_scores2, num_tokens2) + norm2_offset);
|
||||||
log_debug("total_sim = %f, norm1_offset = %f, norm2_offset = %f, norm = %f\n", total_sim, norm1_offset, norm2_offset, norm);
|
log_debug("total_sim = %f, norm1_offset = %f, norm2_offset = %f, norm = %f\n", total_sim, norm1_offset, norm2_offset, norm);
|
||||||
|
|
||||||
@@ -484,5 +545,5 @@ return_soft_tfidf_score:
|
|||||||
|
|
||||||
|
|
||||||
double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches) {
|
double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches) {
|
||||||
return soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, options, num_matches);
|
return soft_tfidf_similarity_with_phrases_and_acronyms(num_tokens1, tokens1, token_scores1, NULL, num_tokens2, tokens2, token_scores2, NULL, NULL, NULL, options, num_matches);
|
||||||
}
|
}
|
||||||
@@ -45,7 +45,7 @@ typedef struct soft_tfidf_options {
|
|||||||
|
|
||||||
soft_tfidf_options_t soft_tfidf_default_options(void);
|
soft_tfidf_options_t soft_tfidf_default_options(void);
|
||||||
|
|
||||||
double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options, size_t *num_matches);
|
double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, phrase_array *multi_word_alignments, soft_tfidf_options_t options, size_t *num_matches);
|
||||||
double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches);
|
double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
Reference in New Issue
Block a user