[dedupe/similarity] Also use the L2 norm in the similarity computation when an acronym match is detected. In this case the similarity contribution should be the acronym token's score multiplied by the L2 norm of the expanded tokens' scores in the longer string.
This commit is contained in:
@@ -362,11 +362,14 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
matched_tokens += p1.len;
|
matched_tokens += p1.len;
|
||||||
log_debug("have_phrase_match\n");
|
log_debug("have_phrase_match\n");
|
||||||
} else {
|
} else {
|
||||||
|
double acronym_score = 0.0;
|
||||||
for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
|
for (size_t p = acronym_phrase.start; p < acronym_phrase.start + acronym_phrase.len; p++) {
|
||||||
t2_score = token_scores2[p];
|
t2_score = token_scores2[p];
|
||||||
total_sim += t1_score * t2_score;
|
acronym_score += t2_score * t2_score;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
total_sim += t1_score * sqrt(acronym_score);
|
||||||
|
|
||||||
log_debug("have_acronym_match\n");
|
log_debug("have_acronym_match\n");
|
||||||
matched_tokens++;
|
matched_tokens++;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user