From 8b75c44026aa8eeb7a72b9ebb21961b86469fec2 Mon Sep 17 00:00:00 2001
From: Al
Date: Sun, 24 Dec 2017 12:41:44 -0500
Subject: [PATCH] [dedupe] adding a test program for near dupe hashing that simply prints out the results. Automated tests in the works

---
 src/near_dupe_test.c | 157 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 src/near_dupe_test.c

diff --git a/src/near_dupe_test.c b/src/near_dupe_test.c
new file mode 100644
index 00000000..18155dea
--- /dev/null
+++ b/src/near_dupe_test.c
@@ -0,0 +1,157 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libpostal.h"
+#include "string_utils.h"
+
+/*
+ * Command-line driver for near-dupe hashing.
+ *
+ * Positional arguments are read as alternating label/value pairs. Recognized
+ * options: --with-unit, --name-only-keys, --address-only-keys, --latitude LAT,
+ * --longitude LON, --geohash-precision N and --language LANG (one value per
+ * occurrence). Each hash returned by libpostal_near_dupe_hashes_languages()
+ * is printed on its own line.
+ *
+ * Exits with EXIT_FAILURE on too few arguments, a failed libpostal setup, an
+ * unparseable numeric option value, or a label/value count mismatch.
+ */
+int main(int argc, char **argv) {
+    if (argc < 3) {
+        printf("Usage: ./test_near_dupe label value [...]\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
+        exit(EXIT_FAILURE);
+    }
+
+    libpostal_near_dupe_hash_options_t options = libpostal_near_dupe_hash_default_options();
+
+    cstring_array *labels_array = cstring_array_new();
+    cstring_array *values_array = cstring_array_new();
+    /* Created lazily, only once a --language value is seen. */
+    cstring_array *languages_array = NULL;
+
+    /* Positional arguments alternate: label, value, label, value, ... */
+    bool label = true;
+    /* A set next_is_* flag means the next non-option argument is that option's value. */
+    bool next_is_latitude = false;
+    bool next_is_longitude = false;
+    bool next_is_geohash_precision = false;
+    bool next_is_language = false;
+    bool have_latitude = false;
+    bool have_longitude = false;
+    double longitude = 0.0;
+    double latitude = 0.0;
+
+    /* argc is an int, so an int index avoids a signed/unsigned comparison. */
+    for (int i = 1; i < argc; i++) {
+        char *arg = argv[i];
+
+        if (string_equals(arg, "--with-unit")) {
+            options.with_unit = true;
+        } else if (string_equals(arg, "--latitude")) {
+            next_is_latitude = true;
+        } else if (string_equals(arg, "--longitude")) {
+            next_is_longitude = true;
+        } else if (string_equals(arg, "--geohash-precision")) {
+            next_is_geohash_precision = true;
+        } else if (string_equals(arg, "--name-only-keys")) {
+            options.name_only_keys = true;
+        } else if (string_equals(arg, "--address-only-keys")) {
+            options.address_only_keys = true;
+        } else if (string_equals(arg, "--language")) {
+            next_is_language = true;
+        } else if (next_is_latitude) {
+            if (sscanf(arg, "%lf", &latitude) != 1) {
+                printf("Invalid latitude: %s\n", arg);
+                exit(EXIT_FAILURE);
+            }
+            next_is_latitude = false;
+            have_latitude = true;
+        } else if (next_is_longitude) {
+            if (sscanf(arg, "%lf", &longitude) != 1) {
+                printf("Invalid longitude: %s\n", arg);
+                exit(EXIT_FAILURE);
+            }
+            next_is_longitude = false;
+            have_longitude = true;
+        } else if (next_is_geohash_precision) {
+            size_t geohash_precision = 0;
+            if (sscanf(arg, "%zu", &geohash_precision) != 1) {
+                printf("Invalid geohash precision: %s\n", arg);
+                exit(EXIT_FAILURE);
+            }
+            options.geohash_precision = geohash_precision;
+            next_is_geohash_precision = false;
+        } else if (next_is_language) {
+            if (languages_array == NULL) {
+                languages_array = cstring_array_new();
+            }
+            cstring_array_add_string(languages_array, arg);
+            /* Consume exactly one value; without this reset every later
+             * label and value would be swallowed as a language. */
+            next_is_language = false;
+        } else if (label) {
+            cstring_array_add_string(labels_array, arg);
+            label = false;
+        } else {
+            cstring_array_add_string(values_array, arg);
+            label = true;
+        }
+    }
+
+    /* Coordinates are applied only when both latitude and longitude were given. */
+    if (have_latitude && have_longitude) {
+        options.with_latlon = true;
+        options.latitude = latitude;
+        options.longitude = longitude;
+    }
+
+    size_t num_components = cstring_array_num_strings(labels_array);
+    if (num_components != cstring_array_num_strings(values_array)) {
+        cstring_array_destroy(labels_array);
+        cstring_array_destroy(values_array);
+        if (languages_array != NULL) {
+            cstring_array_destroy(languages_array);
+        }
+        printf("Must have same number of labels and values\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* NOTE(review): cstring_array_to_strings() presumably takes ownership of
+     * the cstring_array, since the arrays are never destroyed afterwards;
+     * confirm against string_utils.h. */
+    char **labels = cstring_array_to_strings(labels_array);
+    char **values = cstring_array_to_strings(values_array);
+
+    size_t num_languages = 0;
+    char **languages = NULL;
+    if (languages_array != NULL) {
+        num_languages = cstring_array_num_strings(languages_array);
+        languages = cstring_array_to_strings(languages_array);
+    }
+
+    size_t num_near_dupe_hashes = 0;
+    char **near_dupe_hashes = libpostal_near_dupe_hashes_languages(num_components, labels, values, options, num_languages, languages, &num_near_dupe_hashes);
+
+    for (size_t j = 0; j < num_near_dupe_hashes; j++) {
+        printf("%s\n", near_dupe_hashes[j]);
+    }
+
+    if (near_dupe_hashes != NULL) {
+        libpostal_expansion_array_destroy(near_dupe_hashes, num_near_dupe_hashes);
+    }
+    libpostal_expansion_array_destroy(labels, num_components);
+    libpostal_expansion_array_destroy(values, num_components);
+
+    if (languages != NULL) {
+        libpostal_expansion_array_destroy(languages, num_languages);
+    }
+
+    libpostal_teardown();
+    libpostal_teardown_language_classifier();
+
+    return EXIT_SUCCESS;
+}