From b90c3dab4bbf73ba53cd93eac818c2b955ee99bc Mon Sep 17 00:00:00 2001
From: Al
Date: Thu, 28 Dec 2017 04:34:25 -0500
Subject: [PATCH] [similarity/dedupe] adding Soft-TFIDF implementation with
 several different fallback qualifiers for the max-sim function
 (Damerau-Levenshtein and libpostal's new bucketed affine gap method for
 detecting abbreviations), but keeping Jaro-Winkler as the secondary
 similarity function in the final distance metric. Overall this should result
 in higher similarity values when one of the tokens may not quite match the
 pure secondary threshold in terms of Jaro-Winkler but may match on one of the
 other criteria.

---
Review revision: the edit-distance fallback is now gated on the length of the
token being matched instead of the number of tokens, and it only credits a
token that was actually found. Empty and NULL inputs return 0.0, and the code
point array handling moved into two static helpers. The blob ids on the index
lines below still refer to the original submission.

 src/soft_tfidf.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++
 src/soft_tfidf.h |  56 ++++++++++++++++
 2 files changed, 224 insertions(+)
 create mode 100644 src/soft_tfidf.c
 create mode 100644 src/soft_tfidf.h

diff --git a/src/soft_tfidf.c b/src/soft_tfidf.c
new file mode 100644
index 00000000..1bd43220
--- /dev/null
+++ b/src/soft_tfidf.c
@@ -0,0 +1,168 @@
+#include "soft_tfidf.h"
+#include "float_utils.h"
+#include "string_similarity.h"
+#include "string_utils.h"
+
+static const soft_tfidf_options_t DEFAULT_SOFT_TFIDF_OPTIONS = {
+    .jaro_winkler_min = 0.9,
+    .damerau_levenshtein_max = 1,
+    .damerau_levenshtein_min_length = 4,
+    .use_abbreviations = true
+};
+
+/* Returns a copy of the default option values defined above. */
+soft_tfidf_options_t soft_tfidf_default_options(void) {
+    return DEFAULT_SOFT_TFIDF_OPTIONS;
+}
+
+/* Destroys every non-NULL entry among the first n, then frees the list. */
+static void soft_tfidf_unicode_tokens_destroy(uint32_array **tokens_unicode, size_t n) {
+    if (tokens_unicode == NULL) return;
+
+    for (size_t i = 0; i < n; i++) {
+        if (tokens_unicode[i] != NULL) {
+            uint32_array_destroy(tokens_unicode[i]);
+        }
+    }
+    free(tokens_unicode);
+}
+
+/*
+Converts n tokens with unicode_codepoints. Returns NULL if the list cannot
+be allocated or any conversion fails; partial results are released first.
+*/
+static uint32_array **soft_tfidf_unicode_tokens_new(size_t n, char **tokens) {
+    uint32_array **tokens_unicode = calloc(n, sizeof(*tokens_unicode));
+    if (tokens_unicode == NULL) return NULL;
+
+    for (size_t i = 0; i < n; i++) {
+        tokens_unicode[i] = unicode_codepoints(tokens[i]);
+        if (tokens_unicode[i] == NULL) {
+            // Slots not yet filled are NULL thanks to calloc
+            soft_tfidf_unicode_tokens_destroy(tokens_unicode, n);
+            return NULL;
+        }
+    }
+
+    return tokens_unicode;
+}
+
+/*
+Computes the modified Soft-TFIDF similarity described in soft_tfidf.h.
+
+Returns 0.0 if any input array is NULL or either side has no tokens, and
+-1.0 if the code point arrays cannot be built. Otherwise returns the sum,
+over the tokens of the side with fewer tokens, of
+jaro_winkler * score1 * score2 for the counterpart selected on the other side.
+*/
+double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options) {
+    if (tokens1 == NULL || tokens2 == NULL) return 0.0;
+    if (token_scores1 == NULL || token_scores2 == NULL) return 0.0;
+    // calloc(0, ...) may return NULL, which is not an allocation failure
+    if (num_tokens1 == 0 || num_tokens2 == 0) return 0.0;
+
+    // Iterate over the side with fewer tokens, search the other side
+    if (num_tokens2 < num_tokens1) {
+        double *tmp_scores = token_scores1;
+        token_scores1 = token_scores2;
+        token_scores2 = tmp_scores;
+
+        char **tmp_tokens = tokens1;
+        tokens1 = tokens2;
+        tokens2 = tmp_tokens;
+
+        size_t tmp_num_tokens = num_tokens1;
+        num_tokens1 = num_tokens2;
+        num_tokens2 = tmp_num_tokens;
+    }
+
+    size_t len1 = num_tokens1;
+    size_t len2 = num_tokens2;
+
+    uint32_array **t1_tokens_unicode = soft_tfidf_unicode_tokens_new(len1, tokens1);
+    if (t1_tokens_unicode == NULL) return -1.0;
+
+    uint32_array **t2_tokens_unicode = soft_tfidf_unicode_tokens_new(len2, tokens2);
+    if (t2_tokens_unicode == NULL) {
+        soft_tfidf_unicode_tokens_destroy(t1_tokens_unicode, len1);
+        return -1.0;
+    }
+
+    double jaro_winkler_min = options.jaro_winkler_min;
+    size_t damerau_levenshtein_max = options.damerau_levenshtein_max;
+    size_t damerau_levenshtein_min_length = options.damerau_levenshtein_min_length;
+    bool use_abbreviations = options.use_abbreviations;
+
+    double total_sim = 0.0;
+
+    for (size_t i = 0; i < len1; i++) {
+        uint32_array *t1u = t1_tokens_unicode[i];
+        double t1_score = token_scores1[i];
+
+        // The minimum length applies to the token's code points, not to the
+        // number of tokens in the string
+        bool use_damerau_levenshtein = damerau_levenshtein_max > 0 && t1u->n >= damerau_levenshtein_min_length;
+
+        double max_sim = 0.0;
+        size_t argmax_sim = 0;
+
+        size_t min_dist = t1u->n;
+        size_t argmin_dist = 0;
+        double argmin_dist_sim = 0.0;
+        bool have_edit_candidate = false;
+
+        size_t last_abbreviation = 0;
+        double last_abbreviation_sim = 0.0;
+        bool have_abbreviation = false;
+
+        for (size_t j = 0; j < len2; j++) {
+            uint32_array *t2u = t2_tokens_unicode[j];
+
+            if (unicode_equals(t1u, t2u)) {
+                max_sim = 1.0;
+                argmax_sim = j;
+                break;
+            }
+
+            double jaro_winkler = jaro_winkler_distance_unicode(t1u, t2u);
+            if (jaro_winkler > max_sim) {
+                max_sim = jaro_winkler;
+                argmax_sim = j;
+            }
+
+            if (use_damerau_levenshtein) {
+                size_t replace_cost = 0;
+                ssize_t dist = damerau_levenshtein_distance_unicode(t1u, t2u, replace_cost);
+                if (dist >= 0 && (size_t)dist < min_dist) {
+                    min_dist = (size_t)dist;
+                    argmin_dist = j;
+                    argmin_dist_sim = jaro_winkler;
+                    have_edit_candidate = true;
+                }
+            }
+
+            if (use_abbreviations && possible_abbreviation_unicode(t1u, t2u)) {
+                last_abbreviation = j;
+                last_abbreviation_sim = jaro_winkler;
+                have_abbreviation = true;
+            }
+        }
+
+        // Note: here edit distance, affine gap and abbreviations are only
+        // used in the thresholding process. Jaro-Winkler is still used to
+        // calculate similarity
+        if (max_sim > jaro_winkler_min || double_equals(max_sim, jaro_winkler_min)) {
+            total_sim += max_sim * t1_score * token_scores2[argmax_sim];
+        } else if (have_edit_candidate && min_dist <= damerau_levenshtein_max) {
+            // Requires a real candidate: the initial min_dist belongs to no token
+            total_sim += argmin_dist_sim * t1_score * token_scores2[argmin_dist];
+        } else if (have_abbreviation) {
+            total_sim += last_abbreviation_sim * t1_score * token_scores2[last_abbreviation];
+        }
+    }
+
+    soft_tfidf_unicode_tokens_destroy(t1_tokens_unicode, len1);
+    soft_tfidf_unicode_tokens_destroy(t2_tokens_unicode, len2);
+
+    return total_sim;
+}
diff --git a/src/soft_tfidf.h b/src/soft_tfidf.h
new file mode 100644
index 00000000..7d777fc5
--- /dev/null
+++ b/src/soft_tfidf.h
@@ -0,0 +1,56 @@
+#ifndef SOFT_TFIDF_H
+#define SOFT_TFIDF_H
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include "collections.h"
+#include "libpostal.h"
+
+/*
+This is a variant of Soft-TFIDF as described in:
+
+Cohen, Ravikumar, and Fienberg. A comparison of string distance
+metrics for name-matching tasks. (2003)
+https://www.cs.cmu.edu/~wcohen/postscript/ijcai-ws-2003.pdf
+
+Soft TFIDF is a hybrid similarity function for strings, typically names,
+which combines both global statistics (TF-IDF) and a local similarity
+function (e.g. Jaro-Winkler, which the authors suggest performs best).
+
+Given two strings, s1 and s2, each token t1 in s1 is matched with its most
+similar counterpart t2 in s2 according to the local distance function.
+
+The Soft-TFIDF similarity is then the dot product of the max token
+similarities and the cosine similarity of the TF-IDF vectors for all tokens
+if the max similarity is >= a given threshold theta.
+
+This version is a modified Soft-TFIDF. Jaro-Winkler is used as the secondary
+distance metric. However, two tokens are considered "similar" if any of the
+following holds:
+
+1. Jaro-Winkler distance >= theta
+2. Damerau-Levenshtein edit distance <= max_edit_distance (only tried when
+   the token being matched has at least a minimum number of code points)
+3. Affine gap edit counts indicate a possible abbreviation (# matches == min(len1, len2))
+*/
+
+typedef struct soft_tfidf_options {
+    // theta: minimum Jaro-Winkler value for criterion 1
+    double jaro_winkler_min;
+    // max_edit_distance for criterion 2; 0 disables the criterion
+    size_t damerau_levenshtein_max;
+    // minimum token length, in code points, for criterion 2 to be tried
+    size_t damerau_levenshtein_min_length;
+    // whether criterion 3 is tried at all
+    bool use_abbreviations;
+} soft_tfidf_options_t;
+
+soft_tfidf_options_t soft_tfidf_default_options(void);
+
+/*
+Returns the similarity of the two token sequences, 0.0 if any array is NULL
+or either sequence is empty, or -1.0 if the code point arrays cannot be built.
+*/
+double soft_tfidf_similarity(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, soft_tfidf_options_t options);
+
+#endif