[similarity/dedupe] normalize by the product of the L2 norms in the soft token similarity function, as in cosine similarity. Score vectors can now be passed in unnormalized (previously they were expected to have unit length). Also, for aligned phrases that share the same canonical phrase, contribute the product of the L2 norms of the two phrase vectors to the similarity's numerator, i.e. the maximum possible value, as if each token in both strings had matched exactly. The previous version over-counted the importance of aligned multi-word phrases by summing over the full cross product of token-score pairs, which could overshadow other, more important terms.
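A minimal sketch of the new normalization, not the actual libpostal code: l2_norm and normalize_similarity below are illustrative helpers mirroring the double_array_l2_norm call that appears in the diff.

#include <math.h>
#include <stddef.h>

/* L2 norm of a token score vector (assumed shape of double_array_l2_norm). */
static double l2_norm(const double *scores, size_t n) {
    double sum_sq = 0.0;
    for (size_t i = 0; i < n; i++) {
        sum_sq += scores[i] * scores[i];
    }
    return sqrt(sum_sq);
}

/* Cosine-style normalization: divide the raw soft-TFIDF accumulator by the
 * product of the two score vectors' L2 norms, so callers can pass
 * unnormalized scores and an exact match still yields 1.0. */
static double normalize_similarity(double total_sim,
                                   const double *scores1, size_t n1,
                                   const double *scores2, size_t n2) {
    return total_sim / (l2_norm(scores1, n1) * l2_norm(scores2, n2));
}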

Al
2018-01-22 01:38:12 -05:00
parent 2883b57eb3
commit eb3fb37ad4

@@ -1,4 +1,5 @@
 #include "soft_tfidf.h"
+#include <math.h>
 #include "address_dictionary.h"
 #include "float_utils.h"
 #include "string_similarity.h"
@@ -221,6 +222,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
         size_t t1_len = t1u->n;
+        log_debug("t1 = %s\n", tokens1[i]);
+
         double max_sim = 0.0;
         size_t min_dist = t1_len;
         size_t argmax_sim = 0;
@@ -244,8 +247,16 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
         double t2_score;
+
+        log_debug("p1.len = %zu, i = %zu, p1.start = %zu\n", p1.len, i, p1.start);
+        if (p1.len > 0 && i > p1.start) {
+            log_debug("skipping token\n");
+            continue;
+        }
+
         for (size_t j = 0; j < len2; j++) {
             t2u = t2_tokens_unicode[j];
+            log_debug("t2 = %s\n", tokens2[j]);
             int64_t pm2 = phrase_memberships2 != NULL ? phrase_memberships2[j] : NULL_PHRASE_MEMBERSHIP;
             phrase_t p2 = pm2 >= 0 ? phrases2->a[pm2] : NULL_PHRASE;
@@ -317,36 +328,50 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
         if (!have_acronym_match && !have_phrase_match) {
             if (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min))) {
-                log_debug("have max sim = %f\n", max_sim);
+                log_debug("jaro-winkler, max_sim = %f\n", max_sim);
                 t2_score = token_scores2[argmax_sim];
                 total_sim += max_sim * t1_score * t2_score;
                 matched_tokens++;
             } else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
-                log_debug("levenshtein\n");
+                log_debug("levenshtein, argmin_dist_sim = %f\n", argmin_dist_sim);
                 t2_score = token_scores2[argmin_dist];
                 total_sim += argmin_dist_sim * t1_score * t2_score;
                 matched_tokens++;
             } else if (possible_affine_gap_abbreviations && have_abbreviation) {
-                log_debug("have abbreviation\n");
+                log_debug("have abbreviation, last_abbreviation_sim = %f\n", last_abbreviation_sim);
                 t2_score = token_scores2[last_abbreviation];
                 total_sim += last_abbreviation_sim * t1_score * t2_score;
                 matched_tokens++;
             }
         } else if (have_phrase_match) {
+            double p2_score = 0.0;
             for (size_t p = argmax_phrase.start; p < argmax_phrase.start + argmax_phrase.len; p++) {
                 t2_score = token_scores2[p];
-                total_sim += max_sim * t1_score * t2_score;
+                p2_score += t2_score * t2_score;
             }
-            matched_tokens++;
+
+            double p1_score = 0.0;
+            for (size_t p = p1.start; p < p1.start + p1.len; p++) {
+                double t1_score_p = token_scores1[p];
+                p1_score += t1_score_p * t1_score_p;
+            }
+
+            total_sim += sqrt(p1_score) * sqrt(p2_score);
+            matched_tokens += p1.len;
             log_debug("have_phrase_match\n");
         } else {
             for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
                 t2_score = token_scores2[p];
-                total_sim += max_sim * t1_score * t2_score;
+                total_sim += t1_score * t2_score;
             }
             log_debug("have_acronym_match\n");
             matched_tokens++;
         }
+        log_debug("total sim = %f\n", total_sim);
     }
     log_debug("matched_tokens = %zu\n", matched_tokens);
@@ -386,7 +411,10 @@ return_soft_tfidf_score:
         int64_array_destroy(acronym_memberships_array);
     }
-    return total_sim;
+
+    double norm = double_array_l2_norm(token_scores1, num_tokens1) * double_array_l2_norm(token_scores2, num_tokens2);
+    log_debug("total_sim = %f, norm = %f\n", total_sim, norm);
+    return total_sim / norm;
 }
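For the phrase branch in the hunk above, the contribution reduces to the product of the two phrase sub-vectors' L2 norms. A standalone sketch of that accumulation follows; phrase_span_t is an assumed stand-in for libpostal's phrase_t, which carries at least start and len here.

#include <math.h>
#include <stddef.h>

/* Assumed stand-in for phrase_t: a phrase covers token indices
 * [start, start + len). */
typedef struct {
    size_t start;
    size_t len;
} phrase_span_t;

/* Numerator contribution of an aligned phrase pair sharing a canonical
 * form: ||p1|| * ||p2||, the maximum possible value, as if every token
 * in both phrases had matched exactly. */
static double phrase_pair_contribution(const double *token_scores1, phrase_span_t p1,
                                       const double *token_scores2, phrase_span_t p2) {
    double p1_score = 0.0;
    for (size_t p = p1.start; p < p1.start + p1.len; p++) {
        p1_score += token_scores1[p] * token_scores1[p];
    }
    double p2_score = 0.0;
    for (size_t p = p2.start; p < p2.start + p2.len; p++) {
        p2_score += token_scores2[p] * token_scores2[p];
    }
    return sqrt(p1_score) * sqrt(p2_score);
}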