From 2106a6cfe4f44ebc57e14ff3e09ad5d98edad76c Mon Sep 17 00:00:00 2001
From: Al
Date: Sat, 8 Aug 2015 19:44:50 -0400
Subject: [PATCH] [build] Adding command-line test and bench programs

---
 src/Makefile.am |   7 ++++-
 src/bench.c     | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.c      |  78 ++++++++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 src/bench.c
 create mode 100644 src/main.c

diff --git a/src/Makefile.am b/src/Makefile.am
index 070601da..5c1d7e13 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,8 +19,13 @@ noinst_LTLIBRARIES = libscanner.la
 libscanner_la_SOURCES = scanner.c
 libscanner_la_CFLAGS = $(CFLAGS_O0)
 
+noinst_PROGRAMS = libpostal bench
+libpostal_SOURCES = main.c
+libpostal_LDADD = libpostal.la
+bench_SOURCES = bench.c
+bench_LDADD = libpostal.la libscanner.la
+
 pkginclude_HEADERS = libpostal.h
-pkgdata_DATA = libpostal_data.tar.gz
 
 LIBPOSTAL_S3_BUCKET_NAME = libpostal
 LIBPOSTAL_S3_BUCKET_URL = http://$(LIBPOSTAL_S3_BUCKET_NAME).s3.amazonaws.com
diff --git a/src/bench.c b/src/bench.c
new file mode 100644
index 00000000..323e57ab
--- /dev/null
+++ b/src/bench.c
@@ -0,0 +1,103 @@
+/*
+ * bench.c -- command-line throughput benchmark for expand_address().
+ *
+ * Usage: bench string languages...
+ *
+ * Expands the input string num_loops times with a fixed set of
+ * normalization options, then prints the elapsed CPU time, addresses/s
+ * and tokens/s on stdout.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libpostal.h"
+#include "log/log.h"
+#include "scanner.h"
+
+int main(int argc, char **argv) {
+    if (argc < 3) {
+        log_error("Usage: test_libpostal string languages...\n");
+        exit(EXIT_FAILURE);
+    }
+
+    char *str = argv[1];
+    int num_languages = argc - 2;
+    char *languages[num_languages];
+    for (int i = 0; i < num_languages; i++) {
+        char *arg = argv[i + 2];
+        if (strlen(arg) >= MAX_LANGUAGE_LEN) {
+            /* Diagnostics go to stderr so they never mix with the results. */
+            fprintf(stderr, "arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, MAX_LANGUAGE_LEN - 1);
+        }
+        languages[i] = arg;
+    }
+
+    if (!libpostal_setup()) {
+        exit(EXIT_FAILURE);
+    }
+
+    normalize_options_t options = {
+        /* Pass every language given on the command line, not just the first. */
+        .num_languages = num_languages,
+        .languages = languages,
+        .address_components = ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT,
+        .latin_ascii = 1,
+        .transliterate = 1,
+        .strip_accents = 1,
+        .decompose = 1,
+        .lowercase = 1,
+        .trim_string = 1,
+        .replace_word_hyphens = 1,
+        .delete_word_hyphens = 0,
+        .replace_numeric_hyphens = 0,
+        .delete_numeric_hyphens = 0,
+        .split_alpha_from_numeric = 1,
+        .delete_final_periods = 1,
+        .delete_acronym_periods = 1,
+        .drop_english_possessives = 1,
+        .delete_apostrophes = 1,
+        .expand_numex = 1,
+        .roman_numerals = 1
+    };
+
+    int num_loops = 100000;
+
+    /* Tokenize once up front only to learn the token count for tokens/s. */
+    token_array *tokens = tokenize(str);
+    if (tokens == NULL) {
+        libpostal_teardown();
+        exit(EXIT_FAILURE);
+    }
+    uint64_t num_tokens = tokens->n;
+    token_array_destroy(tokens);
+
+    clock_t t1 = clock();
+    for (int i = 0; i < num_loops; i++) {
+        /* Zeroed in case expand_address returns without setting it. */
+        uint64_t num_expansions = 0;
+        char **strings = expand_address(str, options, &num_expansions);
+        if (strings == NULL) {
+            continue;
+        }
+        /* This program frees both the individual strings and the array. */
+        for (uint64_t j = 0; j < num_expansions; j++) {
+            free(strings[j]);
+        }
+        free(strings);
+    }
+    clock_t t2 = clock();
+
+    /* clock() measures processor time, so this is CPU time, not wall time. */
+    double benchmark_time = (double)(t2 - t1) / CLOCKS_PER_SEC;
+    printf("Benchmark time: %f\n", benchmark_time);
+    double addresses_per_second = num_loops / benchmark_time;
+    printf("addresses/s = %f\n", addresses_per_second);
+    double tokens_per_second = (num_loops * num_tokens) / benchmark_time;
+    printf("tokens/s = %f\n", tokens_per_second);
+    libpostal_teardown();
+    return EXIT_SUCCESS;
+}
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 00000000..e803a5ef
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,78 @@
+/*
+ * main.c -- command-line test program for expand_address().
+ *
+ * Usage: libpostal string languages...
+ *
+ * Prints each expansion of the input string on its own line on stdout.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libpostal.h"
+#include "log/log.h"
+
+int main(int argc, char **argv) {
+    if (argc < 3) {
+        log_error("Usage: test_libpostal string languages...\n");
+        exit(EXIT_FAILURE);
+    }
+
+    char *str = argv[1];
+    int num_languages = argc - 2;
+    char *languages[num_languages];
+    for (int i = 0; i < num_languages; i++) {
+        char *arg = argv[i + 2];
+        if (strlen(arg) >= MAX_LANGUAGE_LEN) {
+            /* Keep diagnostics off stdout, which carries the expansions. */
+            fprintf(stderr, "arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, MAX_LANGUAGE_LEN - 1);
+        }
+        languages[i] = arg;
+    }
+
+    if (!libpostal_setup()) {
+        exit(EXIT_FAILURE);
+    }
+
+    normalize_options_t options = {
+        /* Pass every language given on the command line, not just the first. */
+        .num_languages = num_languages,
+        .languages = languages,
+        .address_components = ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT,
+        .latin_ascii = 1,
+        .transliterate = 1,
+        .strip_accents = 1,
+        .decompose = 1,
+        .lowercase = 1,
+        .trim_string = 1,
+        .replace_word_hyphens = 1,
+        .delete_word_hyphens = 0,
+        .replace_numeric_hyphens = 0,
+        .delete_numeric_hyphens = 0,
+        .split_alpha_from_numeric = 1,
+        .delete_final_periods = 1,
+        .delete_acronym_periods = 1,
+        .drop_english_possessives = 1,
+        .delete_apostrophes = 1,
+        .expand_numex = 1,
+        .roman_numerals = 1
+    };
+
+    /* Zeroed in case expand_address returns without setting it. */
+    uint64_t num_expansions = 0;
+    char **strings = expand_address(str, options, &num_expansions);
+
+    /* This program frees both the individual strings and the array. */
+    if (strings != NULL) {
+        for (uint64_t i = 0; i < num_expansions; i++) {
+            printf("%s\n", strings[i]);
+            free(strings[i]);
+        }
+        free(strings);
+    }
+
+    libpostal_teardown();
+    return EXIT_SUCCESS;
+}