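[api] adding APIs for getting default duplicate and fuzzy-duplicate options (with and without languages), and using a consistent naming convention: libpostal_get_* for default-option getters and libpostal_fuzzy_duplicate_* for the fuzzy duplicate option/status types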
This commit is contained in:
@@ -263,7 +263,7 @@ bool have_ideographic_word_tokens(token_array *token_array) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) {
|
libpostal_fuzzy_duplicate_status_t is_fuzzy_duplicate(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options, libpostal_normalize_options_t normalize_options, soft_tfidf_options_t soft_tfidf_options, bool do_acronyms) {
|
||||||
normalize_options.num_languages = options.num_languages;
|
normalize_options.num_languages = options.num_languages;
|
||||||
normalize_options.languages = options.languages;
|
normalize_options.languages = options.languages;
|
||||||
|
|
||||||
@@ -362,10 +362,10 @@ libpostal_duplicate_status_similarity_t is_fuzzy_duplicate(size_t num_tokens1, c
|
|||||||
free(joined2);
|
free(joined2);
|
||||||
}
|
}
|
||||||
|
|
||||||
return (libpostal_duplicate_status_similarity_t){dupe_status, max_sim};
|
return (libpostal_fuzzy_duplicate_status_t){dupe_status, max_sim};
|
||||||
}
|
}
|
||||||
|
|
||||||
inline libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
|
inline libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
|
||||||
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
|
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
|
||||||
normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME;
|
normalize_options.address_components = LIBPOSTAL_ADDRESS_NAME;
|
||||||
|
|
||||||
@@ -377,7 +377,7 @@ inline libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t nu
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
|
inline libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
|
||||||
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
|
libpostal_normalize_options_t normalize_options = libpostal_get_default_options();
|
||||||
normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET;
|
normalize_options.address_components = LIBPOSTAL_ADDRESS_STREET;
|
||||||
|
|
||||||
|
|||||||
@@ -16,8 +16,8 @@ libpostal_duplicate_status_t is_floor_duplicate(char *value1, char *value2, libp
|
|||||||
libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
libpostal_duplicate_status_t is_postal_code_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
||||||
libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options);
|
libpostal_duplicate_status_t is_toponym_duplicate(size_t num_components1, char **labels1, char **values1, size_t num_components2, char **labels2, char **values2, libpostal_duplicate_options_t options);
|
||||||
|
|
||||||
libpostal_duplicate_status_similarity_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
|
libpostal_fuzzy_duplicate_status_t is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);
|
||||||
libpostal_duplicate_status_similarity_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
|
libpostal_fuzzy_duplicate_status_t is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);
|
||||||
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
@@ -79,7 +79,7 @@ static libpostal_near_dupe_hash_options_t LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIO
|
|||||||
.address_only_keys = false
|
.address_only_keys = false
|
||||||
};
|
};
|
||||||
|
|
||||||
libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void) {
|
libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void) {
|
||||||
return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
|
return LIBPOSTAL_NEAR_DUPE_HASH_DEFAULT_OPTIONS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -109,6 +109,22 @@ char **libpostal_place_languages(size_t num_components, char **labels, char **va
|
|||||||
return languages;
|
return languages;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static libpostal_duplicate_options_t LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS = {
|
||||||
|
.num_languages = 0,
|
||||||
|
.languages = NULL
|
||||||
|
};
|
||||||
|
|
||||||
|
libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void) {
|
||||||
|
return LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS;
|
||||||
|
}
|
||||||
|
|
||||||
|
libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages) {
|
||||||
|
libpostal_duplicate_options_t options = LIBPOSTAL_DUPLICATE_DEFAULT_OPTIONS;
|
||||||
|
options.num_languages = num_languages;
|
||||||
|
options.languages = languages;
|
||||||
|
return options;
|
||||||
|
}
|
||||||
|
|
||||||
libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
|
libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options) {
|
||||||
return is_name_duplicate(value1, value2, options);
|
return is_name_duplicate(value1, value2, options);
|
||||||
}
|
}
|
||||||
@@ -141,11 +157,34 @@ libpostal_duplicate_status_t libpostal_is_toponym_duplicate(size_t num_component
|
|||||||
return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options);
|
return is_toponym_duplicate(num_components1, labels1, values1, num_components2, labels2, values2, options);
|
||||||
}
|
}
|
||||||
|
|
||||||
libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
|
#define DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD 0.7
|
||||||
|
#define DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD 0.9
|
||||||
|
|
||||||
|
static libpostal_fuzzy_duplicate_options_t DEFAULT_FUZZY_DUPLICATE_OPTIONS = {
|
||||||
|
.num_languages = 0,
|
||||||
|
.languages = NULL,
|
||||||
|
.needs_review_threshold = DEFAULT_FUZZY_DUPLICATE_NEEDS_REVIEW_THRESHOLD,
|
||||||
|
.likely_dupe_threshold = DEFAULT_FUZZY_DUPLICATE_LIKELY_DUPE_THRESHOLD
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void) {
|
||||||
|
return DEFAULT_FUZZY_DUPLICATE_OPTIONS;
|
||||||
|
}
|
||||||
|
|
||||||
|
libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages) {
|
||||||
|
libpostal_fuzzy_duplicate_options_t options = DEFAULT_FUZZY_DUPLICATE_OPTIONS;
|
||||||
|
options.num_languages = num_languages;
|
||||||
|
options.languages = languages;
|
||||||
|
return options;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
|
||||||
return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
|
return is_name_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
|
||||||
}
|
}
|
||||||
|
|
||||||
libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options) {
|
libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options) {
|
||||||
return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
|
return is_street_duplicate_fuzzy(num_tokens1, tokens1, token_scores1, num_tokens2, tokens2, token_scores2, options);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -192,7 +192,7 @@ typedef struct libpostal_near_dupe_hash_options {
|
|||||||
} libpostal_near_dupe_hash_options_t;
|
} libpostal_near_dupe_hash_options_t;
|
||||||
|
|
||||||
|
|
||||||
LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_near_dupe_hash_default_options(void);
|
LIBPOSTAL_EXPORT libpostal_near_dupe_hash_options_t libpostal_get_near_dupe_hash_default_options(void);
|
||||||
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
|
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t *num_hashes);
|
||||||
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);
|
LIBPOSTAL_EXPORT char **libpostal_near_dupe_hashes_languages(size_t num_components, char **labels, char **values, libpostal_near_dupe_hash_options_t options, size_t num_languages, char **languages, size_t *num_hashes);
|
||||||
|
|
||||||
@@ -215,6 +215,10 @@ typedef struct libpostal_duplicate_options {
|
|||||||
char **languages;
|
char **languages;
|
||||||
} libpostal_duplicate_options_t;
|
} libpostal_duplicate_options_t;
|
||||||
|
|
||||||
|
|
||||||
|
LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_default_duplicate_options(void);
|
||||||
|
LIBPOSTAL_EXPORT libpostal_duplicate_options_t libpostal_get_duplicate_options_with_languages(size_t num_languages, char **languages);
|
||||||
|
|
||||||
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_name_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
||||||
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_street_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
||||||
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_house_number_duplicate(char *value1, char *value2, libpostal_duplicate_options_t options);
|
||||||
@@ -226,20 +230,23 @@ LIBPOSTAL_EXPORT libpostal_duplicate_status_t libpostal_is_toponym_duplicate(siz
|
|||||||
|
|
||||||
// Pairwise fuzzy dupe methods, return status & similarity
|
// Pairwise fuzzy dupe methods, return status & similarity
|
||||||
|
|
||||||
typedef struct libpostal_duplicate_similarity_options {
|
typedef struct libpostal_fuzzy_duplicate_options {
|
||||||
size_t num_languages;
|
size_t num_languages;
|
||||||
char **languages;
|
char **languages;
|
||||||
double needs_review_threshold;
|
double needs_review_threshold;
|
||||||
double likely_dupe_threshold;
|
double likely_dupe_threshold;
|
||||||
} libpostal_duplicate_similarity_options_t;
|
} libpostal_fuzzy_duplicate_options_t;
|
||||||
|
|
||||||
typedef struct libpostal_duplicate_status_similarity {
|
typedef struct libpostal_duplicate_status_similarity {
|
||||||
libpostal_duplicate_status_t status;
|
libpostal_duplicate_status_t status;
|
||||||
double similarity;
|
double similarity;
|
||||||
} libpostal_duplicate_status_similarity_t;
|
} libpostal_fuzzy_duplicate_status_t;
|
||||||
|
|
||||||
LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
|
LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options(void);
|
||||||
LIBPOSTAL_EXPORT libpostal_duplicate_status_similarity_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_duplicate_similarity_options_t options);
|
LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_options_t libpostal_get_default_fuzzy_duplicate_options_with_languages(size_t num_languages, char **languages);
|
||||||
|
|
||||||
|
LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_name_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);
|
||||||
|
LIBPOSTAL_EXPORT libpostal_fuzzy_duplicate_status_t libpostal_is_street_duplicate_fuzzy(size_t num_tokens1, char **tokens1, double *token_scores1, size_t num_tokens2, char **tokens2, double *token_scores2, libpostal_fuzzy_duplicate_options_t options);
|
||||||
|
|
||||||
// Setup/teardown methods
|
// Setup/teardown methods
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ int main(int argc, char **argv) {
|
|||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
libpostal_near_dupe_hash_options_t options = libpostal_near_dupe_hash_default_options();
|
libpostal_near_dupe_hash_options_t options = libpostal_get_near_dupe_hash_default_options();
|
||||||
|
|
||||||
cstring_array *labels_array = cstring_array_new();
|
cstring_array *labels_array = cstring_array_new();
|
||||||
cstring_array *values_array = cstring_array_new();
|
cstring_array *values_array = cstring_array_new();
|
||||||
|
|||||||
Reference in New Issue
Block a user