[similarity/dedupe] adding options for acronym alignments and address phrase matches in Soft-TFIDF. Acronym alignments give higher similarity to pairs such as "NYU" vs. "New York University", whereas phrase matches match known phrases that share the same canonical form, such as "Cty Rd" vs. "C.R." vs. "County Road", within the Soft-TFIDF similarity calculation.

This commit is contained in:
Al
2017-12-29 02:38:48 -05:00
parent 24a77ea03f
commit f1e6886536
2 changed files with 223 additions and 15 deletions

View File

@@ -4,6 +4,7 @@
#include <stdlib.h>
#include "collections.h"
#include "libpostal.h"
#include "trie_search.h"
/*
This is a variant of Soft-TFIDF as described in:
@@ -41,6 +42,7 @@ typedef struct soft_tfidf_options {
/* Returns a soft_tfidf_options_t by value; takes no arguments.
   NOTE(review): presumably the library's default Soft-TFIDF settings, as the
   name suggests -- the field values are set in the definition, which is not
   visible in this header excerpt. */
soft_tfidf_options_t soft_tfidf_default_options(void);
/* Soft-TFIDF similarity between two token sequences, additionally taking
   phrase matches and acronym alignments into account.

   num_tokens1, tokens1, token_scores1: first token sequence and its per-token
       scores (assumes both arrays hold num_tokens1 entries -- TODO confirm).
   phrases1: phrases found in the first sequence.
   num_tokens2, tokens2, token_scores2, phrases2: same, for the second sequence.
   acronym_alignments: alignments between an acronym in one sequence and its
       expansion in the other (e.g. "NYU" vs. "New York University").
   options: Soft-TFIDF settings, passed by value.

   Returns the similarity as a double.
   NOTE(review): whether the phrase_array arguments may be NULL, and the range
   of the returned value, are not visible from this declaration -- confirm
   against the implementation. */
double soft_tfidf_similarity_with_phrases_and_acronyms(size_t num_tokens1, char **tokens1, double *token_scores1, phrase_array *phrases1, size_t num_tokens2, char **tokens2, double *token_scores2, phrase_array *phrases2, phrase_array *acronym_alignments, soft_tfidf_options_t options);
/* Soft-TFIDF similarity between two token sequences, without phrase or
   acronym inputs.

   num_tokens1, tokens1, token_scores1: first token sequence and its per-token
       scores (assumes both arrays hold num_tokens1 entries -- TODO confirm).
   num_tokens2, tokens2, token_scores2: same, for the second sequence.
   options: Soft-TFIDF settings, passed by value.

   Returns the similarity as a double.
   NOTE(review): the range of the returned value is not visible from this
   declaration -- confirm against the implementation. */
double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options);
#endif