[build] Adding command-line test and bench programs
This commit is contained in:
@@ -19,8 +19,13 @@ noinst_LTLIBRARIES = libscanner.la
|
|||||||
libscanner_la_SOURCES = scanner.c
|
libscanner_la_SOURCES = scanner.c
|
||||||
libscanner_la_CFLAGS = $(CFLAGS_O0)
|
libscanner_la_CFLAGS = $(CFLAGS_O0)
|
||||||
|
|
||||||
|
noinst_PROGRAMS = libpostal bench
|
||||||
|
libpostal_SOURCES = main.c
|
||||||
|
libpostal_LDADD = libpostal.la
|
||||||
|
bench_SOURCES = bench.c
|
||||||
|
bench_LDADD = libpostal.la libscanner.la
|
||||||
|
|
||||||
pkginclude_HEADERS = libpostal.h
|
pkginclude_HEADERS = libpostal.h
|
||||||
pkgdata_DATA = libpostal_data.tar.gz
|
|
||||||
|
|
||||||
LIBPOSTAL_S3_BUCKET_NAME = libpostal
|
LIBPOSTAL_S3_BUCKET_NAME = libpostal
|
||||||
LIBPOSTAL_S3_BUCKET_URL = http://$(LIBPOSTAL_S3_BUCKET_NAME).s3.amazonaws.com
|
LIBPOSTAL_S3_BUCKET_URL = http://$(LIBPOSTAL_S3_BUCKET_NAME).s3.amazonaws.com
|
||||||
|
|||||||
82
src/bench.c
Normal file
82
src/bench.c
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
#include "libpostal.h"
|
||||||
|
#include "log/log.h"
|
||||||
|
#include "scanner.h"
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 3) {
|
||||||
|
log_error("Usage: test_libpostal string languages...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
char *str = argv[1];
|
||||||
|
char *languages[argc - 2];
|
||||||
|
for (int i = 0; i < argc - 2; i++) {
|
||||||
|
char *arg = argv[i + 2];
|
||||||
|
if (strlen(arg) >= MAX_LANGUAGE_LEN) {
|
||||||
|
printf("arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, MAX_LANGUAGE_LEN - 1);
|
||||||
|
}
|
||||||
|
languages[i] = arg;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!libpostal_setup()) {
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize_options_t options = {
|
||||||
|
.num_languages = 1,
|
||||||
|
.languages = languages,
|
||||||
|
.address_components = ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT,
|
||||||
|
.latin_ascii = 1,
|
||||||
|
.transliterate = 1,
|
||||||
|
.strip_accents = 1,
|
||||||
|
.decompose = 1,
|
||||||
|
.lowercase = 1,
|
||||||
|
.trim_string = 1,
|
||||||
|
.replace_word_hyphens = 1,
|
||||||
|
.delete_word_hyphens = 0,
|
||||||
|
.replace_numeric_hyphens = 0,
|
||||||
|
.delete_numeric_hyphens = 0,
|
||||||
|
.split_alpha_from_numeric = 1,
|
||||||
|
.delete_final_periods = 1,
|
||||||
|
.delete_acronym_periods = 1,
|
||||||
|
.drop_english_possessives = 1,
|
||||||
|
.delete_apostrophes = 1,
|
||||||
|
.expand_numex = 1,
|
||||||
|
.roman_numerals = 1
|
||||||
|
};
|
||||||
|
|
||||||
|
uint64_t num_expansions;
|
||||||
|
|
||||||
|
char **strings;
|
||||||
|
char *normalized;
|
||||||
|
|
||||||
|
int num_loops = 100000;
|
||||||
|
|
||||||
|
token_array *tokens = tokenize(str);
|
||||||
|
uint64_t num_tokens = tokens->n;
|
||||||
|
token_array_destroy(tokens);
|
||||||
|
|
||||||
|
clock_t t1 = clock();
|
||||||
|
for (int i = 0; i < num_loops; i++) {
|
||||||
|
strings = expand_address(str, options, &num_expansions);
|
||||||
|
for (uint64_t i = 0; i < num_expansions; i++) {
|
||||||
|
normalized = strings[i];
|
||||||
|
free(normalized);
|
||||||
|
}
|
||||||
|
free(strings);
|
||||||
|
}
|
||||||
|
clock_t t2 = clock();
|
||||||
|
|
||||||
|
double benchmark_time = (double)(t2 - t1) / CLOCKS_PER_SEC;
|
||||||
|
printf("Benchmark time: %f\n", benchmark_time);
|
||||||
|
double addresses_per_second = num_loops / benchmark_time;
|
||||||
|
printf("addresses/s = %f\n", addresses_per_second);
|
||||||
|
double tokens_per_second = (num_loops * num_tokens) / benchmark_time;
|
||||||
|
printf("tokens/s = %f\n", tokens_per_second);
|
||||||
|
libpostal_teardown();
|
||||||
|
}
|
||||||
65
src/main.c
Normal file
65
src/main.c
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
#include "libpostal.h"
|
||||||
|
#include "log/log.h"
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 3) {
|
||||||
|
log_error("Usage: test_libpostal string languages...\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
char *str = argv[1];
|
||||||
|
char *languages[argc - 2];
|
||||||
|
for (int i = 0; i < argc - 2; i++) {
|
||||||
|
char *arg = argv[i + 2];
|
||||||
|
if (strlen(arg) >= MAX_LANGUAGE_LEN) {
|
||||||
|
printf("arg %d was longer than a language code (%d chars). Make sure to quote the input string\n", i + 2, MAX_LANGUAGE_LEN - 1);
|
||||||
|
}
|
||||||
|
languages[i] = arg;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!libpostal_setup()) {
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize_options_t options = {
|
||||||
|
.num_languages = 1,
|
||||||
|
.languages = languages,
|
||||||
|
.address_components = ADDRESS_HOUSE_NUMBER | ADDRESS_STREET | ADDRESS_UNIT,
|
||||||
|
.latin_ascii = 1,
|
||||||
|
.transliterate = 1,
|
||||||
|
.strip_accents = 1,
|
||||||
|
.decompose = 1,
|
||||||
|
.lowercase = 1,
|
||||||
|
.trim_string = 1,
|
||||||
|
.replace_word_hyphens = 1,
|
||||||
|
.delete_word_hyphens = 0,
|
||||||
|
.replace_numeric_hyphens = 0,
|
||||||
|
.delete_numeric_hyphens = 0,
|
||||||
|
.split_alpha_from_numeric = 1,
|
||||||
|
.delete_final_periods = 1,
|
||||||
|
.delete_acronym_periods = 1,
|
||||||
|
.drop_english_possessives = 1,
|
||||||
|
.delete_apostrophes = 1,
|
||||||
|
.expand_numex = 1,
|
||||||
|
.roman_numerals = 1
|
||||||
|
};
|
||||||
|
|
||||||
|
uint64_t num_expansions;
|
||||||
|
|
||||||
|
char **strings = expand_address(str, options, &num_expansions);
|
||||||
|
|
||||||
|
char *normalized;
|
||||||
|
for (uint64_t i = 0; i < num_expansions; i++) {
|
||||||
|
normalized = strings[i];
|
||||||
|
printf("%s\n", normalized);
|
||||||
|
free(normalized);
|
||||||
|
}
|
||||||
|
|
||||||
|
free(strings);
|
||||||
|
|
||||||
|
libpostal_teardown();
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user