[dedupe] for strict abbreviations (defined as sharing a prefix and a suffix, and containing matches+gaps only by the subtotaling affine gap measure), using the greater of the two scores. This accounts for cases where the abbreviated version may have a much higher weight in one string than the non-abbreviated version does in the other. Same for acronym alignments. Making sure there's a common prefix in regular abbreviation detection. Capping the Soft-TFIDF similarity at 1.0.
This commit is contained in:
@@ -11,7 +11,9 @@ static soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = {
|
|||||||
.jaro_winkler_min_length = 4,
|
.jaro_winkler_min_length = 4,
|
||||||
.damerau_levenshtein_max = 1,
|
.damerau_levenshtein_max = 1,
|
||||||
.damerau_levenshtein_min_length = 4,
|
.damerau_levenshtein_min_length = 4,
|
||||||
.possible_affine_gap_abbreviations = true
|
.possible_affine_gap_abbreviations = true,
|
||||||
|
.strict_abbreviation_min_length = 4,
|
||||||
|
.strict_abbreviation_sim = 0.99
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -210,6 +212,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
size_t jaro_winkler_min_length = options.jaro_winkler_min_length;
|
size_t jaro_winkler_min_length = options.jaro_winkler_min_length;
|
||||||
size_t damerau_levenshtein_max = options.damerau_levenshtein_max;
|
size_t damerau_levenshtein_max = options.damerau_levenshtein_max;
|
||||||
size_t damerau_levenshtein_min_length = options.damerau_levenshtein_min_length;
|
size_t damerau_levenshtein_min_length = options.damerau_levenshtein_min_length;
|
||||||
|
size_t strict_abbreviation_min_length = options.strict_abbreviation_min_length;
|
||||||
|
|
||||||
bool possible_affine_gap_abbreviations = options.possible_affine_gap_abbreviations;
|
bool possible_affine_gap_abbreviations = options.possible_affine_gap_abbreviations;
|
||||||
|
|
||||||
@@ -232,6 +235,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
size_t last_abbreviation = 0;
|
size_t last_abbreviation = 0;
|
||||||
double last_abbreviation_sim = 0.0;
|
double last_abbreviation_sim = 0.0;
|
||||||
bool have_abbreviation = false;
|
bool have_abbreviation = false;
|
||||||
|
bool have_strict_abbreviation = false;
|
||||||
bool have_acronym_match = false;
|
bool have_acronym_match = false;
|
||||||
phrase_t acronym_phrase = NULL_PHRASE;
|
phrase_t acronym_phrase = NULL_PHRASE;
|
||||||
bool have_phrase_match = false;
|
bool have_phrase_match = false;
|
||||||
@@ -240,6 +244,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
phrase_t argmax_phrase = NULL_PHRASE;
|
phrase_t argmax_phrase = NULL_PHRASE;
|
||||||
|
|
||||||
bool use_jaro_winkler = t1_len >= jaro_winkler_min_length;
|
bool use_jaro_winkler = t1_len >= jaro_winkler_min_length;
|
||||||
|
bool use_strict_abbreviation_sim = t1_len >= strict_abbreviation_min_length;
|
||||||
bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1_len >= damerau_levenshtein_min_length;
|
bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1_len >= damerau_levenshtein_min_length;
|
||||||
log_debug("use_jaro_winkler = %d, use_damerau_levenshtein=%d\n", use_jaro_winkler, use_damerau_levenshtein);
|
log_debug("use_jaro_winkler = %d, use_damerau_levenshtein=%d\n", use_jaro_winkler, use_damerau_levenshtein);
|
||||||
|
|
||||||
@@ -321,6 +326,9 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
last_abbreviation = j;
|
last_abbreviation = j;
|
||||||
last_abbreviation_sim = jaro_winkler;
|
last_abbreviation_sim = jaro_winkler;
|
||||||
have_abbreviation = true;
|
have_abbreviation = true;
|
||||||
|
if (use_strict_abbreviation_sim && possible_abbreviation_unicode_strict(t1u, t2u)) {
|
||||||
|
last_abbreviation_sim = last_abbreviation_sim > options.strict_abbreviation_sim ? last_abbreviation_sim : options.strict_abbreviation_sim;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -333,11 +341,23 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
if (have_equal || (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)))) {
|
if (have_equal || (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)))) {
|
||||||
log_debug("jaro-winkler, max_sim = %f\n", max_sim);
|
log_debug("jaro-winkler, max_sim = %f\n", max_sim);
|
||||||
t2_score = token_scores2[argmax_sim];
|
t2_score = token_scores2[argmax_sim];
|
||||||
total_sim += max_sim * t1_score * t2_score;
|
double jaro_winkler_sim = 0.0;
|
||||||
|
if (have_abbreviation && argmax_sim == last_abbreviation) {
|
||||||
|
double abbrev_sim = last_abbreviation_sim > max_sim ? last_abbreviation_sim : max_sim;
|
||||||
|
log_debug("have abbreviation, max(max_sim, last_abbreviation_sim) = %f\n", abbrev_sim);
|
||||||
|
double max_score = t1_score > t2_score ? t1_score : t2_score;
|
||||||
|
jaro_winkler_sim = abbrev_sim * max_score * max_score;
|
||||||
|
} else {
|
||||||
|
jaro_winkler_sim = max_sim * t1_score * t2_score;
|
||||||
|
}
|
||||||
|
total_sim += jaro_winkler_sim;
|
||||||
matched_tokens++;
|
matched_tokens++;
|
||||||
} else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
|
} else if (use_damerau_levenshtein && min_dist <= damerau_levenshtein_max) {
|
||||||
log_debug("levenshtein, argmin_dist_sim = %f\n", argmin_dist_sim);
|
log_debug("levenshtein, argmin_dist_sim = %f\n", argmin_dist_sim);
|
||||||
t2_score = token_scores2[argmin_dist];
|
t2_score = token_scores2[argmin_dist];
|
||||||
|
if (have_abbreviation && argmin_dist == last_abbreviation) {
|
||||||
|
argmin_dist_sim = last_abbreviation_sim > argmin_dist_sim ? last_abbreviation_sim : argmin_dist_sim;
|
||||||
|
}
|
||||||
total_sim += argmin_dist_sim * t1_score * t2_score;
|
total_sim += argmin_dist_sim * t1_score * t2_score;
|
||||||
matched_tokens++;
|
matched_tokens++;
|
||||||
} else if (possible_affine_gap_abbreviations && have_abbreviation) {
|
} else if (possible_affine_gap_abbreviations && have_abbreviation) {
|
||||||
@@ -371,7 +391,11 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
acronym_score += t2_score * t2_score;
|
acronym_score += t2_score * t2_score;
|
||||||
}
|
}
|
||||||
|
|
||||||
total_sim += t1_score * sqrt(acronym_score);
|
acronym_score = sqrt(acronym_score);
|
||||||
|
|
||||||
|
double max_acronym_score = t1_score > acronym_score ? t1_score : acronym_score;
|
||||||
|
|
||||||
|
total_sim += max_acronym_score * max_acronym_score;
|
||||||
|
|
||||||
log_debug("have_acronym_match\n");
|
log_debug("have_acronym_match\n");
|
||||||
matched_tokens++;
|
matched_tokens++;
|
||||||
@@ -420,7 +444,8 @@ return_soft_tfidf_score:
|
|||||||
double norm = double_array_l2_norm(token_scores1, num_tokens1) * double_array_l2_norm(token_scores2, num_tokens2);
|
double norm = double_array_l2_norm(token_scores1, num_tokens1) * double_array_l2_norm(token_scores2, num_tokens2);
|
||||||
log_debug("total_sim = %f, norm = %f\n", total_sim, norm);
|
log_debug("total_sim = %f, norm = %f\n", total_sim, norm);
|
||||||
|
|
||||||
return total_sim / norm;
|
double total_sim_norm = total_sim / norm;
|
||||||
|
return total_sim_norm > 1.0 ? 1.0 : total_sim_norm;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,8 @@ typedef struct soft_tfidf_options {
|
|||||||
size_t damerau_levenshtein_max;
|
size_t damerau_levenshtein_max;
|
||||||
size_t damerau_levenshtein_min_length;
|
size_t damerau_levenshtein_min_length;
|
||||||
bool possible_affine_gap_abbreviations;
|
bool possible_affine_gap_abbreviations;
|
||||||
|
size_t strict_abbreviation_min_length;
|
||||||
|
double strict_abbreviation_sim;
|
||||||
} soft_tfidf_options_t;
|
} soft_tfidf_options_t;
|
||||||
|
|
||||||
soft_tfidf_options_t soft_tfidf_default_options(void);
|
soft_tfidf_options_t soft_tfidf_default_options(void);
|
||||||
|
|||||||
@@ -299,7 +299,9 @@ bool possible_abbreviation_unicode_with_edits(uint32_array *u1_array, uint32_arr
|
|||||||
inline bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array) {
|
inline bool possible_abbreviation_unicode(uint32_array *u1_array, uint32_array *u2_array) {
|
||||||
affine_gap_edits_t edits = affine_gap_distance_unicode(u1_array, u2_array);
|
affine_gap_edits_t edits = affine_gap_distance_unicode(u1_array, u2_array);
|
||||||
|
|
||||||
return possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits);
|
ssize_t prefix_len = unicode_common_prefix(u1_array, u2_array);
|
||||||
|
|
||||||
|
return prefix_len > 0 && possible_abbreviation_unicode_with_edits(u1_array, u2_array, edits);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user