[fix] Soft-TFIDF was missing some shorter tokens that are Unicode-equal; exact matches are now counted even when Jaro-Winkler is not used for the token
This commit is contained in:
@@ -243,6 +243,8 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1_len >= damerau_levenshtein_min_length;
|
bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1_len >= damerau_levenshtein_min_length;
|
||||||
log_debug("use_jaro_winkler = %d, use_damerau_levenshtein=%d\n", use_jaro_winkler, use_damerau_levenshtein);
|
log_debug("use_jaro_winkler = %d, use_damerau_levenshtein=%d\n", use_jaro_winkler, use_damerau_levenshtein);
|
||||||
|
|
||||||
|
bool have_equal = false;
|
||||||
|
|
||||||
canonical_match_t best_canonical_phrase_response = CANONICAL_NO_MATCH;
|
canonical_match_t best_canonical_phrase_response = CANONICAL_NO_MATCH;
|
||||||
|
|
||||||
double t2_score;
|
double t2_score;
|
||||||
@@ -276,6 +278,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
if (unicode_equals(t1u, t2u)) {
|
if (unicode_equals(t1u, t2u)) {
|
||||||
max_sim = 1.0;
|
max_sim = 1.0;
|
||||||
argmax_sim = j;
|
argmax_sim = j;
|
||||||
|
have_equal = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -327,7 +330,7 @@ double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char
|
|||||||
// Jaro-Winkler is still used to calculate similarity
|
// Jaro-Winkler is still used to calculate similarity
|
||||||
|
|
||||||
if (!have_acronym_match && !have_phrase_match) {
|
if (!have_acronym_match && !have_phrase_match) {
|
||||||
if (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min))) {
|
if (have_equal || (use_jaro_winkler && (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)))) {
|
||||||
log_debug("jaro-winkler, max_sim = %f\n", max_sim);
|
log_debug("jaro-winkler, max_sim = %f\n", max_sim);
|
||||||
t2_score = token_scores2[argmax_sim];
|
t2_score = token_scores2[argmax_sim];
|
||||||
total_sim += max_sim * t1_score * t2_score;
|
total_sim += max_sim * t1_score * t2_score;
|
||||||
|
|||||||
Reference in New Issue
Block a user