diff --git a/configure.ac b/configure.ac
index 0ee49830..dec1e24f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,10 +51,4 @@ AC_CONFIG_FILES([Makefile
                  src/Makefile
                  src/sparkey/Makefile])
 
-AM_CONDITIONAL([HAVE_DATE_STAT], [date -r . >/dev/null 2>&1])
-AM_CONDITIONAL([HAVE_STAT], [stat -f %Sm . >/dev/null 2>&1])
-
-LAST_UPDATED_PATH=$srcdir/libpostal_data_last_updated
-AC_SUBST([LIBPOSTAL_DATA_UPDATED_PATH], $LAST_UPDATED_PATH)
-
 AC_OUTPUT
diff --git a/src/Makefile.am b/src/Makefile.am
index e0f81a52..f7100217 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -14,6 +14,9 @@ libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c
 libpostal_la_LIBADD = libscanner.la sparkey/libsparkey.la
 libpostal_la_CFLAGS = $(CFLAGS_O2)
 
+# Data download/upload helper script; installed into $(bindir) and included in the dist tarball
+dist_bin_SCRIPTS = libpostal_data
+
 # Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough
 noinst_LTLIBRARIES = libscanner.la
 libscanner_la_SOURCES = scanner.c
@@ -22,63 +25,32 @@ libscanner_la_CFLAGS = $(CFLAGS_O0)
 noinst_PROGRAMS = libpostal bench build_address_dictionary build_geodb build_numex_table build_trans_table
 libpostal_SOURCES = main.c
 libpostal_LDADD = libpostal.la
+libpostal_CFLAGS = $(CFLAGS_O3)
 bench_SOURCES = bench.c
 bench_LDADD = libpostal.la libscanner.la
+bench_CFLAGS = $(CFLAGS_O3)
 build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
+build_address_dictionary_CFLAGS = $(CFLAGS_O3)
 build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c normalize.c bloom.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c
 build_geodb_LDADD = sparkey/libsparkey.la
+build_geodb_CFLAGS = $(CFLAGS_O3)
 build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
+build_numex_table_CFLAGS = $(CFLAGS_O3)
 build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
+build_trans_table_CFLAGS = $(CFLAGS_O3)
 
 pkginclude_HEADERS = libpostal.h
 
-pkgdata_DATA = libpostal_data.tar.gz
-
-LIBPOSTAL_S3_BUCKET_NAME = libpostal
-LIBPOSTAL_S3_BUCKET_URL = http://$(LIBPOSTAL_S3_BUCKET_NAME).s3.amazonaws.com
-LIBPOSTAL_DATA_FILE = libpostal_data.tar.gz
 LIBPOSTAL_DATA_DIR = $(datadir)/libpostal
 
-EPOCH_DATE = Jan 1 00:00:00 1970
+# Fetch (or refresh) the data files as part of the default build. The helper
+# script sits next to this Makefile.am, so call it through $(srcdir) to keep
+# out-of-tree (VPATH) builds working.
+all-local:
+	$(srcdir)/libpostal_data download "$(LIBPOSTAL_DATA_DIR)"
 
-if HAVE_DATE_STAT
-USE_DATE_STAT = 1
-
-else
-if HAVE_STAT
-USE_STAT = 1
-
-else
-	$(error Cannot get file modification date on this platform);
-endif
-
-endif
-
-
-libpostal_data_mkdir:
-	mkdir -p $(LIBPOSTAL_DATA_DIR)
-
-libpostal_data_updated:
-	if [ ! -e @LIBPOSTAL_DATA_UPDATED_PATH@ ]; then \
-		echo "$(EPOCH_DATE)" > @LIBPOSTAL_DATA_UPDATED_PATH@; \
-	fi;
-
-libpostal_data.tar.gz: | libpostal_data_mkdir libpostal_data_updated
-	if [ $$(curl $(LIBPOSTAL_S3_BUCKET_URL)/$(LIBPOSTAL_DATA_FILE) -z "$$(cat @LIBPOSTAL_DATA_UPDATED_PATH@)" --silent --remote-time -o $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) -w %{http_code}) = "200" ]; then \
-		if [ "x$(USE_DATE_STAT)" != "x" ]; then \
-			echo $$(date -d "$$(date -d "@$$(date -r $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) +%s)") + 1 second") > @LIBPOSTAL_DATA_UPDATED_PATH@; \
-		elif [ "x$(USE_STAT)" != "x" ]; then \
-			echo $$(date -r $$(stat -f %m $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE)) -v+1S) > @LIBPOSTAL_DATA_UPDATED_PATH@; \
-		fi; \
-		tar -xvzf $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) -C $(LIBPOSTAL_DATA_DIR); \
-		rm $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE); \
-	fi;
-
-upload_data_s3:
-	tar -C $(LIBPOSTAL_DATA_DIR) -cvzf $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) address_expansions numex transliteration
-	aws s3 cp --acl=public-read $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) s3://$(LIBPOSTAL_S3_BUCKET_NAME)
 
 lexer: scanner.re
 	re2c -F -s -b -8 -o scanner.c scanner.re
 
-.PHONY: lexer upload_data_s3 libpostal_data_mkdir libpostal_data_updated libpostal_data.tar.gz
+.PHONY: lexer
diff --git a/src/libpostal_data b/src/libpostal_data
new file mode 100755
index 00000000..2539c87f
--- /dev/null
+++ b/src/libpostal_data
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+#
+# Fetch or publish the libpostal data archive.
+#
+# Usage: ./libpostal_data [upload|download] data_dir
+#   download  fetch the archive from S3 into data_dir when it is newer than
+#             the timestamp stored in data_dir/last_updated, then unpack it
+#   upload    tar the module directories under data_dir and copy the archive
+#             to the S3 bucket with the aws CLI
+
+if [ "$#" -ne 2 ]; then
+    echo "Usage: ./libpostal_data [upload|download] data_dir"
+    # Stop here: the command dispatch at the bottom needs both arguments.
+    exit 1
+fi
+
+LIBPOSTAL_S3_BUCKET_NAME="libpostal"
+LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
+LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
+LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
+
+
+COMMAND=$1
+LIBPOSTAL_DATA_DIR=$2
+
+LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
+
+# Directories (relative to data_dir) packed into the archive on upload.
+BASIC_MODULE_DIRS=(address_expansions numex transliteration)
+
+# Seed value for last_updated so the first conditional download always fires.
+EPOCH_DATE="Jan 1 00:00:00 1970"
+
+# download_file UPDATED_PATH DATA_DIR FILENAME NAME
+#
+# Conditionally download FILENAME from the S3 bucket into DATA_DIR and unpack
+# it there. UPDATED_PATH holds the date passed to curl's -z time condition;
+# NAME is only used in the progress messages.
+download_file() {
+    local updated_path=$1
+    local data_dir=$2
+    local filename=$3
+    local name=$4
+
+    local local_path=$data_dir/$filename
+
+    if [ ! -e "$updated_path" ]; then
+        echo "$EPOCH_DATE" > "$updated_path"
+    fi
+
+    echo "Checking for new libpostal $name..."
+
+    # curl prints the HTTP status; 200 means a newer archive was written.
+    if [ "$(curl "$LIBPOSTAL_S3_BUCKET_URL/$filename" -z "$(cat "$updated_path")" --silent --remote-time -o "$local_path" -w '%{http_code}')" = "200" ]; then
+        echo "New libpostal $name available"
+        # Record the archive's mtime plus one second so the next run only
+        # fetches something strictly newer. The GNU date flags are tried
+        # first, then the BSD stat/date flags.
+        if date -r . >/dev/null 2>&1; then
+            echo $(date -d "$(date -d "@$(date -r "$local_path" +%s)") + 1 second") > "$updated_path"
+        elif stat -f %Sm . >/dev/null 2>&1; then
+            echo $(date -r $(stat -f %m "$local_path") -v+1S) > "$updated_path"
+        fi
+        tar -xvzf "$local_path" -C "$data_dir"
+        rm "$local_path"
+    else
+        echo "libpostal $name up to date"
+    fi
+}
+
+if [ "$COMMAND" = "download" ]; then
+    mkdir -p "$LIBPOSTAL_DATA_DIR"
+
+    download_file "$LIBPOSTAL_DATA_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_DATA_FILE" "data file"
+
+elif [ "$COMMAND" = "upload" ]; then
+    tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "${BASIC_MODULE_DIRS[@]}"
+    aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "$LIBPOSTAL_S3_KEY"
+else
+    echo "Invalid command: $COMMAND"
+    exit 1
+fi