diff --git a/src/libpostal.c b/src/libpostal.c
index 03a7dd9d..15f20948 100644
--- a/src/libpostal.c
+++ b/src/libpostal.c
@@ -8,6 +8,7 @@
 
 #include "address_dictionary.h"
 #include "address_parser.h"
+#include "dedupe.h"
 #include "expand.h"
 
 #include "language_classifier.h"
@@ -108,6 +109,47 @@ char **libpostal_place_languages(size_t num_components, char **labels, char **va
     return languages;
 }
 
+libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    return is_name_duplicate(value1, value2, options);
+}
+
+libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    return is_street_duplicate(value1, value2, options);
+}
+
+libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    return is_house_number_duplicate(value1, value2, options);
+}
+
+libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    return is_po_box_duplicate(value1, value2, options);
+}
+
+libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    return is_unit_duplicate(value1, value2, options);
+}
+
+libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    return is_floor_duplicate(value1, value2, options);
+}
+
+libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
+    return is_postal_code_duplicate(value1, value2, options);
+}
+
+libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options) {
+    return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options);
+}
+
+libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
+    return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
+}
+
+libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
+    return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
+}
+
+
 void libpostal_address_parser_response_destroy(libpostal_address_parser_response_t *self) {
     if (self == NULL) return;
 
diff --git a/src/libpostal.h b/src/libpostal.h
index 7c3b7e76..25de29c8 100644
--- a/src/libpostal.h
+++ b/src/libpostal.h
@@ -200,6 +200,47 @@ LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_componen
 
 LIBPOSTAL_EXPORT char **libpostal_place_languages(size_t num_components, char **labels, char **values, size_t *num_languages);
 
+// Pairwise dupe methods
+
+typedef enum {
+    LIBPOSTAL_NULL_DUPLICATE_STATUS = -1,
+    LIBPOSTAL_NON_DUPLICATE = 0,
+    LIBPOSTAL_POSSIBLE_DUPLICATE_NEEDS_REVIEW = 3,
+    LIBPOSTAL_LIKELY_DUPLICATE = 6,
+    LIBPOSTAL_EXACT_DUPLICATE = 9,
+} libpostal_duplicate_status_t;
+
+typedef struct libpostal_duplicate_options {
+    size_t num_languages;
+    char **languages;
+} libpostal_duplicate_options_t;
+
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_po_box_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_unit_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_floor_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options);
+
+// Pairwise fuzzy dupe methods, return status & similarity
+
+typedef struct libpostal_duplicate_similarity_options {
+    size_t num_languages;
+    char **languages;
+    double needs_review_threshold;
+    double likely_dupe_threshold;
+} libpostal_duplicate_similarity_options_t;
+
+typedef struct libpostal_duplicate_status_similarity {
+    libpostal_duplicate_status_t status;
+    double similarity;
+} libpostal_duplicate_status_similarity_t;
+
+LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
+LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
+
 // Setup/teardown methods
 
 LIBPOSTAL_EXPORT bool libpostal_setup(void);