[build] Adding libpostal_data script for downloading data from S3; the Makefile now uses it as part of the all-local target. The script can also be run periodically after install.

This commit is contained in:
Al
2015-09-28 17:26:11 -04:00
parent f29f2f091b
commit abfa744d59
3 changed files with 72 additions and 49 deletions

View File

@@ -51,10 +51,4 @@ AC_CONFIG_FILES([Makefile
src/Makefile src/Makefile
src/sparkey/Makefile]) src/sparkey/Makefile])
AM_CONDITIONAL([HAVE_DATE_STAT], [date -r . >/dev/null 2>&1])
AM_CONDITIONAL([HAVE_STAT], [stat -f %Sm . >/dev/null 2>&1])
LAST_UPDATED_PATH=$srcdir/libpostal_data_last_updated
AC_SUBST([LIBPOSTAL_DATA_UPDATED_PATH], $LAST_UPDATED_PATH)
AC_OUTPUT AC_OUTPUT

View File

@@ -14,6 +14,8 @@ libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c
libpostal_la_LIBADD = libscanner.la sparkey/libsparkey.la libpostal_la_LIBADD = libscanner.la sparkey/libsparkey.la
libpostal_la_CFLAGS = $(CFLAGS_O2) libpostal_la_CFLAGS = $(CFLAGS_O2)
dist_bin_SCRIPTS = libpostal_data
# Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough # Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough
noinst_LTLIBRARIES = libscanner.la noinst_LTLIBRARIES = libscanner.la
libscanner_la_SOURCES = scanner.c libscanner_la_SOURCES = scanner.c
@@ -22,63 +24,29 @@ libscanner_la_CFLAGS = $(CFLAGS_O0)
noinst_PROGRAMS = libpostal bench build_address_dictionary build_geodb build_numex_table build_trans_table noinst_PROGRAMS = libpostal bench build_address_dictionary build_geodb build_numex_table build_trans_table
libpostal_SOURCES = main.c libpostal_SOURCES = main.c
libpostal_LDADD = libpostal.la libpostal_LDADD = libpostal.la
libpostal_CFLAGS = $(CFLAGS_O3)
bench_SOURCES = bench.c bench_SOURCES = bench.c
bench_LDADD = libpostal.la libscanner.la bench_LDADD = libpostal.la libscanner.la
bench_CFLAGS = $(CFLAGS_O3)
build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
build_address_dictionary_CFLAGS = $(CFLAGS_O3)
build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c normalize.c bloom.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c normalize.c bloom.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c
build_geodb_LDADD = sparkey/libsparkey.la build_geodb_LDADD = sparkey/libsparkey.la
build_geodb_CFLAGS = $(CFLAGS_O3)
build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
build_numex_table_CFLAGS = $(CFLAGS_O3)
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
build_trans_table_CFLAGS = $(CFLAGS_O3)
pkginclude_HEADERS = libpostal.h pkginclude_HEADERS = libpostal.h
pkgdata_DATA = libpostal_data.tar.gz
LIBPOSTAL_S3_BUCKET_NAME = libpostal
LIBPOSTAL_S3_BUCKET_URL = http://$(LIBPOSTAL_S3_BUCKET_NAME).s3.amazonaws.com
LIBPOSTAL_DATA_FILE = libpostal_data.tar.gz
LIBPOSTAL_DATA_DIR = $(datadir)/libpostal LIBPOSTAL_DATA_DIR = $(datadir)/libpostal
EPOCH_DATE = Jan 1 00:00:00 1970 all-local:
./libpostal_data download $(LIBPOSTAL_DATA_DIR)
if HAVE_DATE_STAT
USE_DATE_STAT = 1
else
if HAVE_STAT
USE_STAT = 1
else
$(error Cannot get file modification date on this platform);
endif
endif
libpostal_data_mkdir:
mkdir -p $(LIBPOSTAL_DATA_DIR)
libpostal_data_updated:
if [ ! -e @LIBPOSTAL_DATA_UPDATED_PATH@ ]; then \
echo "$(EPOCH_DATE)" > @LIBPOSTAL_DATA_UPDATED_PATH@; \
fi;
libpostal_data.tar.gz: | libpostal_data_mkdir libpostal_data_updated
if [ $$(curl $(LIBPOSTAL_S3_BUCKET_URL)/$(LIBPOSTAL_DATA_FILE) -z "$$(cat @LIBPOSTAL_DATA_UPDATED_PATH@)" --silent --remote-time -o $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) -w %{http_code}) = "200" ]; then \
if [ "x$(USE_DATE_STAT)" != "x" ]; then \
echo $$(date -d "$$(date -d "@$$(date -r $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) +%s)") + 1 second") > @LIBPOSTAL_DATA_UPDATED_PATH@; \
elif [ "x$(USE_STAT)" != "x" ]; then \
echo $$(date -r $$(stat -f %m $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE)) -v+1S) > @LIBPOSTAL_DATA_UPDATED_PATH@; \
fi; \
tar -xvzf $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) -C $(LIBPOSTAL_DATA_DIR); \
rm $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE); \
fi;
upload_data_s3:
tar -C $(LIBPOSTAL_DATA_DIR) -cvzf $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) address_expansions numex transliteration
aws s3 cp --acl=public-read $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) s3://$(LIBPOSTAL_S3_BUCKET_NAME)
lexer: scanner.re lexer: scanner.re
re2c -F -s -b -8 -o scanner.c scanner.re re2c -F -s -b -8 -o scanner.c scanner.re
.PHONY: lexer upload_data_s3 libpostal_data_mkdir libpostal_data_updated libpostal_data.tar.gz .PHONY: lexer

61
src/libpostal_data Executable file
View File

@@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Require exactly two arguments: the command and the data directory.
# Without the exit the script would carry on with empty $1/$2 and fail
# later with a confusing error, so stop right after printing the usage line.
if [ "$#" -ne 2 ]; then
    echo "Usage: ./libpostal_data [upload|download] data_dir"
    exit 1
fi
# S3 bucket that hosts the libpostal data archive, in the two forms used
# below: an s3:// key for uploads (aws cli) and an HTTP URL for downloads (curl).
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
# Name of the archive, both remotely and while it sits in the data directory.
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
# Positional arguments: the command (upload|download) and the data directory.
COMMAND=$1
LIBPOSTAL_DATA_DIR=$2
# Marker file holding the date passed to curl's time condition (-z).
LIBPOSTAL_DATA_UPDATED_PATH="$LIBPOSTAL_DATA_DIR/last_updated"
# Sub-directories of the data directory that are packed into the archive on upload.
BASIC_MODULE_DIRS=(address_expansions numex transliteration)
# Initial marker value, so the first download is always considered newer.
EPOCH_DATE="Jan 1 00:00:00 1970"
# download_file UPDATED_PATH DATA_DIR FILENAME NAME
#
# Conditionally download $LIBPOSTAL_S3_BUCKET_URL/FILENAME into DATA_DIR and
# unpack it there.
#
#   UPDATED_PATH  marker file holding the date handed to curl's -z time
#                 condition; created with $EPOCH_DATE when it does not exist
#   DATA_DIR      directory the archive is downloaded to and extracted in
#   FILENAME      archive name, both remote and local
#   NAME          human-readable label used in progress messages
#
# The archive is only fetched when curl reports HTTP 200 for the time
# condition. The marker is advanced (archive mtime + 1 second) only after the
# archive was extracted successfully, so a failed extraction is retried on the
# next run instead of being reported as "up to date".
#
# Returns non-zero when extraction fails.
download_file() {
    local updated_path=$1
    local data_dir=$2
    local filename=$3
    local name=$4
    local local_path=$data_dir/$filename
    local http_code
    local new_stamp=""

    if [ ! -e "$updated_path" ]; then
        echo "$EPOCH_DATE" > "$updated_path"
    fi

    echo "Checking for new libpostal $name..."

    # Assigned separately from the `local` declaration so a curl failure is
    # not masked by the exit status of `local`.
    http_code=$(curl "$LIBPOSTAL_S3_BUCKET_URL/$filename" -z "$(cat "$updated_path")" --silent --remote-time -o "$local_path" -w '%{http_code}')

    if [ "$http_code" = "200" ]; then
        echo "New libpostal $name available"

        # --remote-time gave the local file the server's modification time;
        # compute that time plus one second before the archive is removed.
        # The first branch uses the `date -r FILE` / `date -d` form, the
        # second the `stat -f` / `date -r SECONDS -v` form.
        if date -r . >/dev/null 2>&1; then
            new_stamp=$(date -d "$(date -d "@$(date -r "$local_path" +%s)") + 1 second")
        elif stat -f %Sm . >/dev/null 2>&1; then
            new_stamp=$(date -r "$(stat -f %m "$local_path")" -v+1S)
        fi

        if tar -xvzf "$local_path" -C "$data_dir"; then
            rm "$local_path"
            # Leave the marker alone when no timestamp could be computed.
            if [ -n "$new_stamp" ]; then
                echo "$new_stamp" > "$updated_path"
            fi
        else
            echo "Failed to extract $local_path" >&2
            rm -f "$local_path"
            return 1
        fi
    else
        echo "libpostal $name up to date"
    fi
}
# Dispatch on the requested command. All expansions are quoted so that an
# empty command or a data directory containing whitespace is handled correctly.
if [ "$COMMAND" = "download" ]; then
    # Make sure the data directory exists, then fetch and unpack the archive
    # if the remote copy is newer than the recorded timestamp.
    mkdir -p "$LIBPOSTAL_DATA_DIR"
    download_file "$LIBPOSTAL_DATA_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_DATA_FILE" "data file"
elif [ "$COMMAND" = "upload" ]; then
    # Pack the module directories (relative to the data directory) and
    # publish the archive to the bucket with a public-read ACL.
    tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "${BASIC_MODULE_DIRS[@]}"
    aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "$LIBPOSTAL_S3_KEY"
else
    echo "Invalid command: $COMMAND"
    exit 1
fi