diff --git a/src/dedupe.c b/src/dedupe.c index bbf613a0..94453f26 100644 --- a/src/dedupe.c +++ b/src/dedupe.c @@ -263,7 +263,7 @@ bool have_ideographic_word_tokens(token_array *token_array) { return false; } -libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) { +libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) { normalize_options.num_languages = options.num_languages; normalize_options.languages = options.languages; @@ -362,10 +362,10 @@ libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, c free(joined2); } - return (libpostal_duplicate_status_similarity_t){dupe_status, max_sim}; + return (libpostal_fuzzy_duplicate_status_t){dupe_status, max_sim}; } -inline libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +inline libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME; @@ -377,7 +377,7 @@ inline libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t nu } -inline libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { libpostal_normalize_options_t normalize_options = libpostal_get_default_options(); normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET; diff --git a/src/dedupe.h b/src/dedupe.h index c9a4fdf8..5c40fb8c 100644 --- a/src/dedupe.h +++ b/src/dedupe.h @@ -16,8 +16,8 @@ libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libp libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options); -libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); -libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); +libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); #endif \ No newline at end of file diff --git a/src/libpostal.c b/src/libpostal.c index 15f20948..9ac2dcab 100644 --- a/src/libpostal.c +++ b/src/libpostal.c @@ -79,7 +79,7 @@ static libpostal_near_dupe_hash_options_t LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIO .address_only_keys = false }; -libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void) { +libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void) { return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS; } @@ -109,6 +109,22 @@ char **libpostal_place_languages(size_t num_components, char **labels, char **va return languages; } +static libpostal_duplicate_options_t LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS = { + .num_languages = 0, + .languages = NULL +}; + +libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void) { + return LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS; +} + +libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages) { + libpostal_duplicate_options_t options = LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS; + options.num_languages = num_languages; + options.languages = languages; + return options; +} + libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) { return is_name_duplicate(value1, value2, options); } @@ -141,11 +157,34 @@ libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_component return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options); } -libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +#define DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD 0.7 +#define DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD 0.9 + +static libpostal_fuzzy_duplicate_options_t DEFAULT_FUZZY_DUPLICATE_OPTIONS = { + .num_languages = 0, + .languages = NULL, + .needs_review_threshold = DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD, + .likely_dupe_threshold = DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD +}; + + +libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void) { + return DEFAULT_FUZZY_DUPLICATE_OPTIONS; +} + +libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages) { + libpostal_fuzzy_duplicate_options_t options = DEFAULT_FUZZY_DUPLICATE_OPTIONS; + options.num_languages = num_languages; + options.languages = languages; + return options; +} + + +libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); } -libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) { +libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) { return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options); } diff --git a/src/libpostal.h b/src/libpostal.h index 25de29c8..76aa4ab5 100644 --- a/src/libpostal.h +++ b/src/libpostal.h @@ -192,7 +192,7 @@ typedef struct libpostal_near_dupe_hash_options { } libpostal_near_dupe_hash_options_t; -LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void); +LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void); LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes); LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes); @@ -215,6 +215,10 @@ typedef struct libpostal_duplicate_options { char **languages; } libpostal_duplicate_options_t; + +LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void); +LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages); + LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options); @@ -226,20 +230,23 @@ LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(siz // Pairwise fuzzy dupe methods, return status & similarity -typedef struct libpostal_duplicate_similarity_options { +typedef struct libpostal_fuzzy_duplicate_options { size_t num_languages; char **languages; double needs_review_threshold; double likely_dupe_threshold; -} libpostal_duplicate_similarity_options_t; +} libpostal_fuzzy_duplicate_options_t; typedef struct libpostal_duplicate_status_similarity { libpostal_duplicate_status_t status; double similarity; -} libpostal_duplicate_status_similarity_t; +} libpostal_fuzzy_duplicate_status_t; -LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); -LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages); + +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); +LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options); // Setup/teardown methods diff --git a/src/near_dupe_test.c b/src/near_dupe_test.c index 18155dea..db7d67bf 100644 --- a/src/near_dupe_test.c +++ b/src/near_dupe_test.c @@ -14,7 +14,7 @@ int main(int argc, char **argv) { exit(EXIT_FAILURE); } - libpostal_near_dupe_hash_options_t options = libpostal_near_dupe_hash_default_options(); + libpostal_near_dupe_hash_options_t options = libpostal_get_near_dupe_hash_default_options(); cstring_array *labels_array = cstring_array_new(); cstring_array *values_array = cstring_array_new();