[build] Add libpostal_data script for downloading data from S3. The Makefile now runs it as part of the all-local target, and it can also be run periodically after install.
This commit is contained in:
@@ -51,10 +51,4 @@ AC_CONFIG_FILES([Makefile
|
|||||||
src/Makefile
|
src/Makefile
|
||||||
src/sparkey/Makefile])
|
src/sparkey/Makefile])
|
||||||
|
|
||||||
AM_CONDITIONAL([HAVE_DATE_STAT], [date -r . >/dev/null 2>&1])
|
|
||||||
AM_CONDITIONAL([HAVE_STAT], [stat -f %Sm . >/dev/null 2>&1])
|
|
||||||
|
|
||||||
LAST_UPDATED_PATH=$srcdir/libpostal_data_last_updated
|
|
||||||
AC_SUBST([LIBPOSTAL_DATA_UPDATED_PATH], $LAST_UPDATED_PATH)
|
|
||||||
|
|
||||||
AC_OUTPUT
|
AC_OUTPUT
|
||||||
|
|||||||
@@ -14,6 +14,8 @@ libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c
|
|||||||
libpostal_la_LIBADD = libscanner.la sparkey/libsparkey.la
|
libpostal_la_LIBADD = libscanner.la sparkey/libsparkey.la
|
||||||
libpostal_la_CFLAGS = $(CFLAGS_O2)
|
libpostal_la_CFLAGS = $(CFLAGS_O2)
|
||||||
|
|
||||||
|
dist_bin_SCRIPTS = libpostal_data
|
||||||
|
|
||||||
# Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough
|
# Scanner can take a very long time to compile with higher optimization levels, so always use -O0, scanner is fast enough
|
||||||
noinst_LTLIBRARIES = libscanner.la
|
noinst_LTLIBRARIES = libscanner.la
|
||||||
libscanner_la_SOURCES = scanner.c
|
libscanner_la_SOURCES = scanner.c
|
||||||
@@ -22,63 +24,29 @@ libscanner_la_CFLAGS = $(CFLAGS_O0)
|
|||||||
noinst_PROGRAMS = libpostal bench build_address_dictionary build_geodb build_numex_table build_trans_table
|
noinst_PROGRAMS = libpostal bench build_address_dictionary build_geodb build_numex_table build_trans_table
|
||||||
libpostal_SOURCES = main.c
|
libpostal_SOURCES = main.c
|
||||||
libpostal_LDADD = libpostal.la
|
libpostal_LDADD = libpostal.la
|
||||||
|
libpostal_CFLAGS = $(CFLAGS_O3)
|
||||||
bench_SOURCES = bench.c
|
bench_SOURCES = bench.c
|
||||||
bench_LDADD = libpostal.la libscanner.la
|
bench_LDADD = libpostal.la libscanner.la
|
||||||
|
bench_CFLAGS = $(CFLAGS_O3)
|
||||||
build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
|
build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
|
||||||
|
build_address_dictionary_CFLAGS = $(CFLAGS_O3)
|
||||||
build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c normalize.c bloom.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c
|
build_geodb_SOURCES = geodb_builder.c geodb.c geo_disambiguation.c normalize.c bloom.c features.c geonames.c geohash/geohash.c unicode_scripts.c transliterate.c trie.c trie_search.c string_utils.c msgpack_utils.c file_utils.c utf8proc/utf8proc.c cmp/cmp.c
|
||||||
build_geodb_LDADD = sparkey/libsparkey.la
|
build_geodb_LDADD = sparkey/libsparkey.la
|
||||||
|
build_geodb_CFLAGS = $(CFLAGS_O3)
|
||||||
build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
|
build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
|
||||||
|
build_numex_table_CFLAGS = $(CFLAGS_O3)
|
||||||
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
|
||||||
|
build_trans_table_CFLAGS = $(CFLAGS_O3)
|
||||||
|
|
||||||
pkginclude_HEADERS = libpostal.h
|
pkginclude_HEADERS = libpostal.h
|
||||||
pkgdata_DATA = libpostal_data.tar.gz
|
|
||||||
|
|
||||||
LIBPOSTAL_S3_BUCKET_NAME = libpostal
|
|
||||||
LIBPOSTAL_S3_BUCKET_URL = http://$(LIBPOSTAL_S3_BUCKET_NAME).s3.amazonaws.com
|
|
||||||
LIBPOSTAL_DATA_FILE = libpostal_data.tar.gz
|
|
||||||
|
|
||||||
LIBPOSTAL_DATA_DIR = $(datadir)/libpostal
|
LIBPOSTAL_DATA_DIR = $(datadir)/libpostal
|
||||||
|
|
||||||
EPOCH_DATE = Jan 1 00:00:00 1970
|
# Fetch (or refresh) the libpostal data files as part of the default build.
# libpostal_data is shipped from the source tree (see dist_bin_SCRIPTS), so it
# is invoked through $(srcdir) rather than ./ to keep out-of-tree (VPATH)
# builds working.
all-local:
	$(srcdir)/libpostal_data download $(LIBPOSTAL_DATA_DIR)
|
||||||
|
|
||||||
if HAVE_DATE_STAT
|
|
||||||
USE_DATE_STAT = 1
|
|
||||||
|
|
||||||
else
|
|
||||||
if HAVE_STAT
|
|
||||||
USE_STAT = 1
|
|
||||||
|
|
||||||
else
|
|
||||||
$(error Cannot get file modification date on this platform);
|
|
||||||
endif
|
|
||||||
|
|
||||||
endif
|
|
||||||
|
|
||||||
|
|
||||||
libpostal_data_mkdir:
|
|
||||||
mkdir -p $(LIBPOSTAL_DATA_DIR)
|
|
||||||
|
|
||||||
libpostal_data_updated:
|
|
||||||
if [ ! -e @LIBPOSTAL_DATA_UPDATED_PATH@ ]; then \
|
|
||||||
echo "$(EPOCH_DATE)" > @LIBPOSTAL_DATA_UPDATED_PATH@; \
|
|
||||||
fi;
|
|
||||||
|
|
||||||
libpostal_data.tar.gz: | libpostal_data_mkdir libpostal_data_updated
|
|
||||||
if [ $$(curl $(LIBPOSTAL_S3_BUCKET_URL)/$(LIBPOSTAL_DATA_FILE) -z "$$(cat @LIBPOSTAL_DATA_UPDATED_PATH@)" --silent --remote-time -o $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) -w %{http_code}) = "200" ]; then \
|
|
||||||
if [ "x$(USE_DATE_STAT)" != "x" ]; then \
|
|
||||||
echo $$(date -d "$$(date -d "@$$(date -r $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) +%s)") + 1 second") > @LIBPOSTAL_DATA_UPDATED_PATH@; \
|
|
||||||
elif [ "x$(USE_STAT)" != "x" ]; then \
|
|
||||||
echo $$(date -r $$(stat -f %m $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE)) -v+1S) > @LIBPOSTAL_DATA_UPDATED_PATH@; \
|
|
||||||
fi; \
|
|
||||||
tar -xvzf $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) -C $(LIBPOSTAL_DATA_DIR); \
|
|
||||||
rm $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE); \
|
|
||||||
fi;
|
|
||||||
|
|
||||||
upload_data_s3:
|
|
||||||
tar -C $(LIBPOSTAL_DATA_DIR) -cvzf $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) address_expansions numex transliteration
|
|
||||||
aws s3 cp --acl=public-read $(LIBPOSTAL_DATA_DIR)/$(LIBPOSTAL_DATA_FILE) s3://$(LIBPOSTAL_S3_BUCKET_NAME)
|
|
||||||
|
|
||||||
lexer: scanner.re
|
lexer: scanner.re
|
||||||
re2c -F -s -b -8 -o scanner.c scanner.re
|
re2c -F -s -b -8 -o scanner.c scanner.re
|
||||||
|
|
||||||
.PHONY: lexer upload_data_s3 libpostal_data_mkdir libpostal_data_updated libpostal_data.tar.gz
|
.PHONY: lexer
|
||||||
|
|||||||
61
src/libpostal_data
Executable file
61
src/libpostal_data
Executable file
@@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Exactly two arguments are required: the command and the data directory.
# Stop here on misuse instead of continuing with empty $1/$2.
if [ "$#" -ne 2 ]; then
    echo "Usage: ./libpostal_data [upload|download] data_dir"
    exit 1
fi
|
||||||
|
|
||||||
|
# S3 bucket that hosts the data archive: the s3:// form is the `aws s3 cp`
# destination for upload, the http:// form is what curl fetches on download.
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"

# Positional arguments: the command (upload|download) and the data directory.
COMMAND=$1
LIBPOSTAL_DATA_DIR=$2

# File holding the date string handed to `curl -z` on the next download.
LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated

# Sub-directories of the data directory that `upload` packs into the archive.
BASIC_MODULE_DIRS=(address_expansions numex transliteration)

# Date written to the stamp file when it does not exist yet.
EPOCH_DATE="Jan 1 00:00:00 1970"
|
||||||
|
|
||||||
|
# download_file UPDATED_PATH DATA_DIR FILENAME NAME
#
# Conditionally download $LIBPOSTAL_S3_BUCKET_URL/FILENAME into DATA_DIR and
# unpack it there.
#
#   UPDATED_PATH  stamp file whose content is passed to `curl -z`; created
#                 with $EPOCH_DATE when missing
#   DATA_DIR      directory the archive is downloaded to and extracted in
#   FILENAME      archive name, both remotely and inside DATA_DIR
#   NAME          human-readable label used in progress messages
#
# Returns 0 when the data is current or was updated, 1 when extraction failed.
download_file() {
    local updated_path=$1
    local data_dir=$2
    local filename=$3
    local name=$4

    local local_path=$data_dir/$filename
    local http_code
    local new_stamp=""

    # First run: seed the stamp so the time condition always matches.
    if [ ! -e "$updated_path" ]; then
        echo "$EPOCH_DATE" > "$updated_path"
    fi

    echo "Checking for new libpostal $name..."

    # -w prints only the HTTP status code; the body (if any) goes to -o.
    http_code=$(curl "$LIBPOSTAL_S3_BUCKET_URL/$filename" \
        -z "$(cat "$updated_path")" \
        --silent --remote-time \
        -o "$local_path" \
        -w '%{http_code}')

    if [ "$http_code" = "200" ]; then
        echo "New libpostal $name available"

        # New stamp = modification time of the downloaded file plus one
        # second. Two date flavours are probed: one where `date -r` accepts a
        # file (then `date -d` is used), otherwise `stat -f` with `date -v`.
        if date -r . >/dev/null 2>&1; then
            new_stamp=$(date -d "$(date -d "@$(date -r "$local_path" +%s)") + 1 second")
        elif stat -f %Sm . >/dev/null 2>&1; then
            new_stamp=$(date -r "$(stat -f %m "$local_path")" -v+1S)
        fi

        # Extract before recording the stamp: if extraction fails the next
        # run must download again rather than report "up to date".
        if ! tar -xvzf "$local_path" -C "$data_dir"; then
            rm -f "$local_path"
            return 1
        fi

        # Never overwrite the stamp with an empty value.
        if [ -n "$new_stamp" ]; then
            echo "$new_stamp" > "$updated_path"
        fi

        rm "$local_path"
    else
        # Drop whatever a non-200 response may have written to the archive path.
        rm -f "$local_path"
        echo "libpostal $name up to date"
    fi
}
|
||||||
|
|
||||||
|
# Dispatch on the requested command.
#   download: make sure the data directory exists, then fetch and unpack the
#             archive if a newer one is available.
#   upload:   pack the module directories into the archive and publish it to
#             S3 with a public-read ACL.
# Expansions are quoted so an empty command or a path containing spaces does
# not break the tests or split into several arguments.
if [ "$COMMAND" = "download" ]; then
    mkdir -p "$LIBPOSTAL_DATA_DIR"

    download_file "$LIBPOSTAL_DATA_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_DATA_FILE" "data file"

elif [ "$COMMAND" = "upload" ]; then
    # Only publish when the archive was built successfully; otherwise an
    # incomplete tarball would replace the public one.
    tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "${BASIC_MODULE_DIRS[@]}" &&
        aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "$LIBPOSTAL_S3_KEY"
else
    echo "Invalid command: $COMMAND"
    exit 1
fi
|
||||||
Reference in New Issue
Block a user