[similarity] adding a match count in Soft-TFIDF to allow answering questions about subsets, e.g. the set of tokens in "Park Pl" contains the set of tokens in "Park". Turning on the Jaro-Winkler minimum length of 4 chars, and using a more specific option name for possible abbreviation detection

This commit is contained in:
Al
2018-01-06 03:50:03 -05:00
parent 0cb488ecea
commit 4356174630
2 changed files with 36 additions and 14 deletions

View File

@@ -35,14 +35,15 @@ defined as either:
/*
 * Tuning options for the Soft-TFIDF similarity functions declared in this
 * header. The struct is passed to those functions by value.
 *
 * NOTE(review): the field meanings below are inferred from the field names
 * and the commit message only; confirm against the implementation.
 */
typedef struct soft_tfidf_options {
/* Presumably the minimum Jaro-Winkler score for two tokens to be treated as a match. */
double jaro_winkler_min;
/* Presumably the minimum token length for Jaro-Winkler to be applied
   (the commit message mentions a 4-character minimum) -- TODO confirm. */
size_t jaro_winkler_min_length;
/* Presumably the maximum Damerau-Levenshtein edit distance accepted as a match. */
size_t damerau_levenshtein_max;
/* Presumably the minimum token length for Damerau-Levenshtein to be applied. */
size_t damerau_levenshtein_min_length;
/* NOTE(review): this listing looks like a diff with old and new lines shown
   together; the commit message describes a "more specific option name for
   possible abbreviation detection", so use_abbreviations is presumably the
   old name and possible_affine_gap_abbreviations its replacement -- confirm
   which of the two fields actually exists in the header. */
bool use_abbreviations;
bool possible_affine_gap_abbreviations;
} soft_tfidf_options_t;
/*
 * Returns a soft_tfidf_options_t by value; takes no arguments.
 * Presumably the returned struct carries the library's default settings
 * (values are defined in the implementation, not visible here).
 */
soft_tfidf_options_t soft_tfidf_default_options(void);
/*
 * Soft-TFIDF similarity between two token sequences.
 *
 * Both functions take, for each side, a token count (num_tokensN), an array
 * of token strings (tokensN) and an array of per-token scores (token_scoresN),
 * plus the options struct by value, and return a double.
 * The *_with_phrases_and_acronyms variant additionally takes a phrase_array
 * per side and a phrase_array of acronym alignments.
 *
 * Assumptions to confirm against the implementation:
 *   - tokensN and token_scoresN presumably each hold num_tokensN entries;
 *   - the return value is presumably the similarity score;
 *   - num_matches is presumably an out-parameter receiving the number of
 *     matched tokens (the commit message says the count lets callers answer
 *     subset questions such as whether the tokens of "Park Pl" contain the
 *     tokens of "Park"); whether NULL is accepted is not visible here.
 *
 * NOTE(review): each function is declared twice below, once without and once
 * with the trailing `size_t *num_matches` parameter. Two prototypes with
 * different parameter lists for the same function cannot coexist in C, so
 * this listing is presumably a diff showing the old declarations followed by
 * the new ones -- confirm that only the num_matches versions remain in the
 * actual header.
 */
double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options);
double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options);
double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options, size_t *num_matches);
double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options, size_t *num_matches);
#endif