Merge pull request #406 from openvenues/new_build_process
New build process
This commit is contained in:
38
.travis.yml
38
.travis.yml
@@ -13,6 +13,10 @@ env:
|
|||||||
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
|
- DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l)
|
||||||
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
|
- NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l)
|
||||||
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
|
- TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l)
|
||||||
|
- TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER
|
||||||
|
- SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz
|
||||||
|
- LIBPOSTAL_DATA_DIR=$(pwd)/data
|
||||||
|
- LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz
|
||||||
compiler:
|
compiler:
|
||||||
- clang
|
- clang
|
||||||
- gcc
|
- gcc
|
||||||
@@ -31,23 +35,49 @@ before_script:
|
|||||||
install:
|
install:
|
||||||
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
|
- if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi
|
||||||
script:
|
script:
|
||||||
- ./configure --datadir=$(pwd)/data
|
- ./configure --datadir=$LIBPOSTAL_DATA_DIR
|
||||||
- make -j4
|
- make -j4
|
||||||
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
|
- if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi;
|
||||||
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
|
- if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi;
|
||||||
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
|
- if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi;
|
||||||
- make check
|
- make check
|
||||||
|
|
||||||
after_success:
|
after_success:
|
||||||
- |
|
- |
|
||||||
if [[ "$CC" == gcc* && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
|
# Match gcc by prefix: the install step re-exports CC as "gcc-4.8" when CC is
# "gcc", so an exact comparison against "gcc" would never succeed here.
if [[ "$CC" == gcc* && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then
|
||||||
env/bin/pip install awscli;
|
if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then
|
||||||
export PATH=$PATH:env/bin/;
|
export PATH=$PATH:env/bin/;
|
||||||
./src/libpostal_data upload base $(pwd)/data/libpostal;
|
|
||||||
git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
|
git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1
|
||||||
cp src/*_data.c _travis/src
|
cp src/*_data.c _travis/src
|
||||||
|
echo "$TAG_VERSION" > _travis/versions/base_data
|
||||||
cd _travis
|
cd _travis
|
||||||
git config user.name "$GIT_COMMITTER_NAME";
|
git config user.name "$GIT_COMMITTER_NAME";
|
||||||
git config user.email "$GIT_COMMITTER_EMAIL";
|
git config user.email "$GIT_COMMITTER_EMAIL";
|
||||||
git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
|
git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER";
|
||||||
git push --quiet origin master;
|
git push --quiet origin master;
|
||||||
|
|
||||||
|
# Archive the basic module directories into the data tarball.
# NOTE(review): BASIC_MODULE_DIRS is not among the env entries visible here;
# fall back to the directory list used by src/libpostal_data when it is unset.
tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME" ${BASIC_MODULE_DIRS:-address_expansions numex transliteration}
|
||||||
|
fi
|
||||||
|
git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER";
|
||||||
|
git push --tags --quiet origin master;
|
||||||
fi;
|
fi;
|
||||||
|
|
||||||
|
before_deploy:
|
||||||
|
- make dist
|
||||||
|
|
||||||
|
deploy:
|
||||||
|
- provider: releases
|
||||||
|
file:
|
||||||
|
- "$SRC_TARBALL_FILENAME"
|
||||||
|
on:
|
||||||
|
tags: true
|
||||||
|
branch: master
|
||||||
|
skip_cleanup: true
|
||||||
|
- provider: releases
|
||||||
|
file:
|
||||||
|
- "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME"
|
||||||
|
on:
|
||||||
|
tags: true
|
||||||
|
branch: master
|
||||||
|
# Prefix-match the compiler: CC is re-exported as "gcc-4.8" during install.
condition: "$CC = gcc* && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )"
|
||||||
|
skip_cleanup: true
|
||||||
|
|||||||
16
configure.ac
16
configure.ac
@@ -2,7 +2,7 @@
|
|||||||
# Process this file with autoconf to produce a configure script.
|
# Process this file with autoconf to produce a configure script.
|
||||||
|
|
||||||
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
|
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
|
||||||
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
|
m4_define(LIBPOSTAL_MINOR_VERSION, [1])
|
||||||
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
|
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
|
||||||
|
|
||||||
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
|
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
|
||||||
@@ -50,10 +50,21 @@ AC_CHECK_TYPES([ptrdiff_t])
|
|||||||
# Checks for library functions.
|
# Checks for library functions.
|
||||||
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
|
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
|
||||||
|
|
||||||
|
AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1])
|
||||||
|
|
||||||
|
DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data)
|
||||||
|
PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser)
|
||||||
|
LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier)
|
||||||
|
|
||||||
|
AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
|
||||||
|
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
|
||||||
|
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
|
||||||
|
|
||||||
AC_CONFIG_FILES([Makefile
|
AC_CONFIG_FILES([Makefile
|
||||||
libpostal.pc
|
libpostal.pc
|
||||||
src/Makefile
|
src/Makefile
|
||||||
test/Makefile])
|
src/libpostal_data
|
||||||
|
test/Makefile], [chmod +x src/libpostal_data])
|
||||||
|
|
||||||
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
|
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
|
||||||
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
|
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
|
||||||
@@ -85,6 +96,7 @@ AC_ARG_ENABLE([data-download],
|
|||||||
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
|
*) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;;
|
||||||
esac], [DOWNLOAD_DATA=true])
|
esac], [DOWNLOAD_DATA=true])
|
||||||
|
|
||||||
|
|
||||||
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
|
AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"])
|
||||||
|
|
||||||
AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],
|
AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])],
|
||||||
|
|||||||
@@ -1,232 +0,0 @@
|
|||||||
#!/bin/sh

# libpostal_data: fetch (or, eventually, upload) the libpostal data archives.
#   $1  command: upload | download
#   $2  which archive(s): base | parser | language_classifier | all
#   $3  data directory the archives are unpacked into

set -e

if [ "$#" -lt 3 ]; then
    echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir"
    exit 1
fi

# Data-dir layout version and the GitHub release tag whose assets are fetched.
LIBPOSTAL_VERSION_STRING="v1"
LIBPOSTAL_RELEASE_VERSION_STRING="v1.0.0"

LIBPOSTAL_REPO_NAME="openvenues/libpostal"
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"

GITHUB_API_URL="https://api.github.com"
LIBPOSTAL_RELEASE_API_URL="$GITHUB_API_URL/repos/$LIBPOSTAL_REPO_NAME/releases"

# Archive (release asset) names.
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"

# S3 key prefixes, one per archive. Each prefix now names its own archive
# (the data and parser entries previously pointed at each other's file).
# NOTE(review): the LIBPOSTAL_LATEST_*_VERSION_STRING variables are not set in
# this script; presumably they come from the environment - confirm.
LIBPOSTAL_DATA_S3_PREFIX="$LIBPOSTAL_LATEST_DATA_VERSION_STRING/$LIBPOSTAL_DATA_FILE"
LIBPOSTAL_PARSER_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/$LIBPOSTAL_PARSER_FILE"
LIBPOSTAL_LANG_CLASS_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/$LIBPOSTAL_LANG_CLASS_FILE"

COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3

# File recording which layout version the data dir was populated with.
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

mkdir -p "$LIBPOSTAL_DATA_DIR"

# Per-archive timestamp files, used for conditional (If-Modified-Since) checks.
LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser
LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier

# Sub-directories of the data dir that each archive provides.
BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier

export LC_ALL=C

# Timestamp written to a missing last_updated file so the first check always
# reports the remote file as newer.
EPOCH_DATE="Jan 1 00:00:00 1970"

MB=$((1024*1024))
CHUNK_SIZE=$((64*MB))

# Archives at least this large are fetched in parallel byte ranges.
LARGE_FILE_SIZE=$((CHUNK_SIZE*2))

# Maximum number of concurrent part downloads.
NUM_WORKERS=12
|
|
||||||
|
|
||||||
# Interrupt handler: send the default kill signal to every process id printed
# by `jobs -p`, then terminate the script.
kill_background_processes() {
    jobs -p | xargs kill
    exit
}
|
|
||||||
|
|
||||||
# Clean up child jobs when the user interrupts the script.
trap kill_background_processes INT

# Command template executed through `sh -c` once per part. Positional
# arguments: $1 part number, $2 first byte, $3 last byte, $4 URL, $5 output
# file. The URL and output path are quoted so that values containing spaces
# or glob characters are passed to curl intact.
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl -L "$4" --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o "$5"'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"
|
|
||||||
|
|
||||||
|
|
||||||
# download_multipart URL FILENAME SIZE
#
# Fetch URL as size/CHUNK_SIZE byte ranges in parallel (at most NUM_WORKERS at
# a time), writing each range to FILENAME.<n>, then concatenate the parts in
# order into FILENAME and delete them.
#
# The assembled output goes to FILENAME (the function's own argument) rather
# than to the caller's $local_path variable, so the function no longer depends
# on a variable set elsewhere.
download_multipart() {
    url=$1
    filename=$2
    size=$3

    num_chunks=$((size/CHUNK_SIZE))
    echo "Downloading multipart: $url, size=$size, num_chunks=$num_chunks"

    # Emit one NUL-separated 5-tuple per part; xargs hands each tuple to the
    # DOWNLOAD_PART template as $1..$5.
    offset=0
    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        if [ "$i" -lt "$num_chunks" ]; then
            max=$((offset+CHUNK_SIZE-1))
        else
            # The last part takes the remainder; range ends are inclusive,
            # so the final byte index is size-1.
            max=$((size-1))
        fi
        printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
        offset=$((offset+CHUNK_SIZE))
    done | xargs -0 -n 5 -P "$NUM_WORKERS" sh -c "$DOWNLOAD_PART" --

    # Start from an empty output file, then append the parts in order.
    : > "$filename"

    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        cat "$part_filename" >> "$filename"
        rm "$part_filename"
    done
}
|
|
||||||
|
|
||||||
|
|
||||||
# download_file UPDATED_PATH DATA_DIR METADATA_URL URL SIZE FILENAME NAME SUBDIR...
#
# Download and unpack one archive if the remote copy is newer than the
# timestamp stored in UPDATED_PATH.
#   UPDATED_PATH  file holding the date of the last successful download
#   DATA_DIR      directory the archive is downloaded to and extracted in
#   METADATA_URL  URL probed with a conditional HEAD request
#   URL           URL the archive itself is fetched from
#   SIZE          archive size in bytes (selects single vs. multipart download)
#   FILENAME      archive file name inside DATA_DIR
#   NAME          human-readable name used in progress messages
#   SUBDIR...     sub-directories of DATA_DIR removed before extraction
download_file() {
    updated_path=$1
    data_dir=$2
    metadata_url=$3
    url=$4
    size=$5
    filename=$6
    name=$7
    shift 7
    subdirs="$*"

    local_path="$data_dir/$filename"

    # No timestamp yet: seed it with the epoch so the remote always looks newer.
    if [ ! -e "$updated_path" ]; then
        echo "$EPOCH_DATE" > "$updated_path"
    fi

    echo "Checking for new libpostal $name..."

    # HEAD request with a time condition; an HTTP 200 status means the remote
    # file changed after the stored date.
    if curl -LsI "$metadata_url" -z "$(cat "$updated_path")" --remote-time -w '%{http_code}' -o /dev/null | grep -q "^200$"; then
        echo "New libpostal $name available"

        if [ "$size" -ge "$LARGE_FILE_SIZE" ]; then
            download_multipart "$url" "$local_path" "$size"
        else
            # --fail: stop on an HTTP error instead of saving the error page
            # as if it were the archive.
            curl -L "$url" --fail --retry 3 --retry-delay 2 -o "$local_path"
        fi

        # Record the archive's modification time plus one second. The first
        # branch is taken when `date -d` is accepted, the second when
        # `stat -f` is accepted; if neither works the timestamp is left as is.
        if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then
            echo $(date -ud "$(date -ud "@$(date -ur "$local_path" +%s)") + 1 second") > "$updated_path"
        elif stat -f %Sm . >/dev/null 2>&1; then
            echo $(date -ur "$(stat -f %m "$local_path")" -v+1S) > "$updated_path"
        fi

        # Remove the previous contents, then unpack and discard the archive.
        for subdir in $subdirs; do
            rm -rf "$data_dir/$subdir"
        done
        tar -xvzf "$local_path" --no-same-owner -C "$data_dir"
        rm "$local_path"
    else
        echo "libpostal $name up to date"
    fi
}
|
|
||||||
|
|
||||||
# Command dispatch.
if [ "$COMMAND" = "download" ]; then
    if [ -e "$LIBPOSTAL_DATA_VERSION_FILE" ]; then
        LIBPOSTAL_DATA_DIR_VERSION=$(cat "$LIBPOSTAL_DATA_VERSION_FILE")
    fi

    # A missing or different layout version means the data dir is stale:
    # remove every module directory and the stored timestamps.
    if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
        echo "Old version of datadir detected, removing..."
        for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
            rm -rf "$LIBPOSTAL_DATA_DIR/$subdir"
        done

        # Legacy, blow it away too to be nice
        if [ -e "$LIBPOSTAL_DATA_DIR/geodb" ]; then
            rm -rf "$LIBPOSTAL_DATA_DIR/geodb"
        fi

        rm -f "$LIBPOSTAL_DATA_UPDATED_PATH"
        rm -f "$LIBPOSTAL_LANG_CLASS_UPDATED_PATH"
        rm -f "$LIBPOSTAL_PARSER_UPDATED_PATH"
    fi

    mkdir -p "$LIBPOSTAL_DATA_DIR"

    # Resolve the release tag to its numeric id via the GitHub API. The
    # lookup is allowed to fail so that a clear message can be printed
    # instead of `set -e` ending the script silently.
    release_id=$(curl -s "$LIBPOSTAL_RELEASE_API_URL/tags/$LIBPOSTAL_RELEASE_VERSION_STRING" | grep "\"id\"" | head -n1 | grep -o '[0-9][0-9]*') || true
    if [ -z "$release_id" ]; then
        echo "Could not determine release id for $LIBPOSTAL_RELEASE_VERSION_STRING" >&2
        exit 1
    fi
    release_assets="$(curl -s "$LIBPOSTAL_RELEASE_API_URL/$release_id/assets")"

    # Pull name, API url, download url and size of each asset out of the JSON
    # into four parallel one-per-line files, then paste them into one table.
    asset_names_tempfile="$LIBPOSTAL_DATA_DIR/asset_names.tmp"
    echo "$release_assets" | grep -o '"name": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > "$asset_names_tempfile"
    asset_metadata_tempfile="$LIBPOSTAL_DATA_DIR/asset_metadata.tmp"
    echo "$release_assets" | grep -o '"url": *"[^"]*/releases/assets/[0-9]*"' | grep -o '"[^"]*"$' | tr -d '"' > "$asset_metadata_tempfile"
    asset_urls_tempfile="$LIBPOSTAL_DATA_DIR/asset_urls.tmp"
    echo "$release_assets" | grep -o '"browser_download_url": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > "$asset_urls_tempfile"
    asset_sizes_tempfile="$LIBPOSTAL_DATA_DIR/asset_sizes.tmp"
    echo "$release_assets" | grep -o '"size": *[0-9]*' | grep -o '[0-9]*$' > "$asset_sizes_tempfile"

    assets_tempfile="$LIBPOSTAL_DATA_DIR/assets.tmp"
    paste -d' ' "$asset_names_tempfile" "$asset_metadata_tempfile" "$asset_urls_tempfile" "$asset_sizes_tempfile" > "$assets_tempfile"

    rm "$asset_names_tempfile" "$asset_metadata_tempfile" "$asset_urls_tempfile" "$asset_sizes_tempfile"

    # One asset per line: name, metadata url, download url, size.
    while read -r line; do
        asset=$(echo "$line" | cut -f1 -d' ')
        asset_metadata_url=$(echo "$line" | cut -f2 -d' ')
        asset_url=$(echo "$line" | cut -f3 -d' ')
        asset_size=$(echo "$line" | cut -f4 -d' ')

        if [ "$asset" = "$LIBPOSTAL_DATA_FILE" ] && { [ "$FILE" = "base" ] || [ "$FILE" = "all" ]; }; then
            download_file "$LIBPOSTAL_DATA_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$asset_metadata_url" "$asset_url" "$asset_size" "$LIBPOSTAL_DATA_FILE" "data file" $BASIC_MODULE_DIRS
        fi
        if [ "$asset" = "$LIBPOSTAL_PARSER_FILE" ] && { [ "$FILE" = "parser" ] || [ "$FILE" = "all" ]; }; then
            download_file "$LIBPOSTAL_PARSER_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$asset_metadata_url" "$asset_url" "$asset_size" "$LIBPOSTAL_PARSER_FILE" "parser data file" $PARSER_MODULE_DIR
        fi
        if [ "$asset" = "$LIBPOSTAL_LANG_CLASS_FILE" ] && { [ "$FILE" = "language_classifier" ] || [ "$FILE" = "all" ]; }; then
            download_file "$LIBPOSTAL_LANG_CLASS_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$asset_metadata_url" "$asset_url" "$asset_size" "$LIBPOSTAL_LANG_CLASS_FILE" "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
        fi

        # Stamp the data dir with the current layout version.
        if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
            echo "$LIBPOSTAL_VERSION_STRING" > "$LIBPOSTAL_DATA_VERSION_FILE"
        fi
    done < "$assets_tempfile"
    rm "$assets_tempfile"

elif [ "$COMMAND" = "upload" ]; then
    echo "upload not implemented yet"

    #if [ $FILE = "base" ] || [ $FILE = "all" ]; then
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/
    #fi

    #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
    #    latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
    #    parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir
    #fi

    #if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
    #    latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
    #    lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir
    #fi
else
    echo "Invalid command: $COMMAND"
    exit 1
fi
|
|
||||||
173
src/libpostal_data.in
Executable file
173
src/libpostal_data.in
Executable file
@@ -0,0 +1,173 @@
|
|||||||
|
#!/bin/sh

# libpostal_data: download the libpostal data archives from GitHub releases.
# This file is a configure template; the @...@ placeholders below are filled
# in by configure (see the AC_SUBST calls in configure.ac).
#   $1  command (only "download" is handled)
#   $2  which archive(s): base | parser | language_classifier | all
#   $3  data directory the archives are unpacked into

set -e

# The usage text lists only the command this script actually accepts.
if [ "$#" -lt 3 ]; then
    echo "Usage: ./libpostal_data download [base|parser|language_classifier|all] data_dir"
    exit 1
fi

COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3

MB=$((1024*1024))
CHUNK_SIZE=$((64*MB))

# Not loving this approach but there appears to be no way to query the size
# of a release asset without using the Github API
# (number of CHUNK_SIZE ranges requested for each archive)
LIBPOSTAL_DATA_FILE_CHUNKS=1
LIBPOSTAL_PARSER_MODEL_CHUNKS=12
LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1

# Expected layout version of the data directory.
LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_DATA_DIR_VERSION_STRING@"

# Release tags to download each archive from.
LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_DATA_FILE_LATEST_VERSION@"
LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_PARSER_MODEL_LATEST_VERSION@"
LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION@"

LIBPOSTAL_REPO_NAME="openvenues/libpostal"

# Archive (release asset) names.
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"

LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download"

# File recording which layout version the data dir was populated with.
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

mkdir -p "$LIBPOSTAL_DATA_DIR"

# Per-archive files recording the release version currently installed.
LIBPOSTAL_DATA_FILE_VERSION_PATH=$LIBPOSTAL_DATA_DIR/base_data_file_version
LIBPOSTAL_PARSER_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/parser_model_file_version
LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/language_classifier_model_file_version

# Sub-directories of the data dir that each archive provides.
BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier

# Maximum number of concurrent part downloads.
NUM_WORKERS=12
|
||||||
|
|
||||||
|
# Interrupt handler: send the default kill signal to every process id printed
# by `jobs -p`, then terminate the script.
kill_background_processes() {
    jobs -p | xargs kill
    exit
}
|
||||||
|
|
||||||
|
# Clean up child jobs when the user interrupts the script.
trap kill_background_processes INT

# Command template executed through `sh -c` once per part. Positional
# arguments: $1 part number, $2 first byte, $3 last byte, $4 URL, $5 output
# file. The URL and output path are quoted so that values containing spaces
# or glob characters are passed to curl intact.
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl -L "$4" --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o "$5"'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"
|
||||||
|
|
||||||
|
|
||||||
|
# download_release_multipart URL FILENAME NUM_CHUNKS
#
# Fetch URL as NUM_CHUNKS consecutive byte ranges of CHUNK_SIZE bytes each, in
# parallel (at most NUM_WORKERS at a time), writing each range to
# FILENAME.<n>; then concatenate the parts in order into FILENAME and delete
# them. All expansions are quoted so paths with spaces survive.
download_release_multipart() {
    url=$1
    filename=$2
    num_chunks=$3

    echo "Downloading multipart: $url, num_chunks=$num_chunks"

    # Emit one NUL-separated 5-tuple per part; xargs hands each tuple to the
    # DOWNLOAD_PART template as $1..$5.
    offset=0
    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        max=$((offset+CHUNK_SIZE-1))
        printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
        offset=$((offset+CHUNK_SIZE))
    done | xargs -0 -n 5 -P "$NUM_WORKERS" sh -c "$DOWNLOAD_PART" --

    # Start from an empty output file, then append the parts in order.
    : > "$filename"

    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        cat "$part_filename" >> "$filename"
        rm "$part_filename"
    done
}
|
||||||
|
|
||||||
|
|
||||||
|
# download_release VERSION_FILE_PATH VERSION DATA_DIR NUM_CHUNKS FILENAME NAME SUBDIR...
#
# Download and unpack one release archive unless VERSION_FILE_PATH already
# records VERSION.
#   VERSION_FILE_PATH  file holding the version currently installed
#   VERSION            release tag to install
#   DATA_DIR           directory the archive is downloaded to and extracted in
#   NUM_CHUNKS         number of byte ranges to fetch (1 = single request)
#   FILENAME           archive file name (release asset name)
#   NAME               human-readable name used in progress messages
#   SUBDIR...          sub-directories of DATA_DIR removed before extraction
download_release() {
    version_file_path=$1
    version=$2
    data_dir=$3
    num_chunks=$4
    filename=$5
    name=$6
    shift 6
    subdirs="$*"

    local_path="$data_dir/$filename"
    url="$LIBPOSTAL_BASE_URL/$version/$filename"

    # A missing version file counts as "nothing installed".
    if [ ! -e "$version_file_path" ]; then
        current_version=""
    else
        current_version="$(cat "$version_file_path")"
    fi

    echo "Checking for new libpostal $name..."

    if [ "$current_version" != "$version" ]; then
        echo "New libpostal $name available"

        if [ "$num_chunks" -gt 1 ]; then
            download_release_multipart "$url" "$local_path" "$num_chunks"
        else
            # --fail: stop on an HTTP error instead of saving the error page
            # as if it were the archive.
            curl -L "$url" --fail --retry 3 --retry-delay 2 -o "$local_path"
        fi

        # Remove the previous contents, then unpack and discard the archive.
        for subdir in $subdirs; do
            rm -rf "$data_dir/$subdir"
        done
        tar -xvzf "$local_path" --no-same-owner -C "$data_dir"
        rm "$local_path"

        # Record the new version only after extraction has succeeded.
        echo "$version" > "$version_file_path"
    else
        echo "libpostal $name up to date"
    fi
}
|
||||||
|
|
||||||
|
# Command dispatch: only "download" is supported.
if [ "$COMMAND" = "download" ]; then
    if [ -e "$LIBPOSTAL_DATA_VERSION_FILE" ]; then
        LIBPOSTAL_DATA_DIR_VERSION=$(cat "$LIBPOSTAL_DATA_VERSION_FILE")

        # The data dir was written for a different layout version: remove
        # every module directory and all stored timestamp/version files.
        if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_DATA_DIR_VERSION_STRING" ]; then
            echo "Old version of datadir detected, removing..."
            for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
                rm -rf "$LIBPOSTAL_DATA_DIR/$subdir"
            done

            # Legacy, blow it away too to be nice
            if [ -e "$LIBPOSTAL_DATA_DIR/geodb" ]; then
                rm -rf "$LIBPOSTAL_DATA_DIR/geodb"
            fi

            # Only the directory part is quoted so the globs still expand.
            rm -f "$LIBPOSTAL_DATA_DIR"/last_updated*
            rm -f "$LIBPOSTAL_DATA_DIR"/*_version
        fi
    fi

    mkdir -p "$LIBPOSTAL_DATA_DIR"

    if [ "$FILE" = "base" ] || [ "$FILE" = "all" ]; then
        download_release "$LIBPOSTAL_DATA_FILE_VERSION_PATH" "$LIBPOSTAL_DATA_FILE_LATEST_VERSION" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_DATA_FILE_CHUNKS" "$LIBPOSTAL_DATA_FILE" "data file" $BASIC_MODULE_DIRS
    fi
    if [ "$FILE" = "parser" ] || [ "$FILE" = "all" ]; then
        download_release "$LIBPOSTAL_PARSER_MODEL_VERSION_PATH" "$LIBPOSTAL_PARSER_MODEL_LATEST_VERSION" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_PARSER_MODEL_CHUNKS" "$LIBPOSTAL_PARSER_FILE" "parser data file" $PARSER_MODULE_DIR
    fi
    if [ "$FILE" = "language_classifier" ] || [ "$FILE" = "all" ]; then
        download_release "$LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH" "$LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS" "$LIBPOSTAL_LANG_CLASS_FILE" "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
    fi

    # Stamp the data dir with the current layout version.
    echo "$LIBPOSTAL_DATA_DIR_VERSION_STRING" > "$LIBPOSTAL_DATA_VERSION_FILE"

else
    echo "Invalid command: $COMMAND"
    exit 1
fi
|
||||||
1
versions/base_data
Normal file
1
versions/base_data
Normal file
@@ -0,0 +1 @@
|
|||||||
|
v1.0.0
|
||||||
1
versions/language_classifier
Normal file
1
versions/language_classifier
Normal file
@@ -0,0 +1 @@
|
|||||||
|
v1.0.0
|
||||||
1
versions/parser
Normal file
1
versions/parser
Normal file
@@ -0,0 +1 @@
|
|||||||
|
v1.0.0
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
# Process this file with autoconf to produce a configure script.
|
# Process this file with autoconf to produce a configure script.
|
||||||
|
|
||||||
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
|
m4_define(LIBPOSTAL_MAJOR_VERSION, [1])
|
||||||
m4_define(LIBPOSTAL_MINOR_VERSION, [0])
|
m4_define(LIBPOSTAL_MINOR_VERSION, [1])
|
||||||
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
|
m4_define(LIBPOSTAL_PATCH_VERSION, [0])
|
||||||
|
|
||||||
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
|
AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION)
|
||||||
@@ -50,10 +50,21 @@ AC_CHECK_TYPES([ptrdiff_t])
|
|||||||
# Checks for library functions.
|
# Checks for library functions.
|
||||||
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
|
AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup])
|
||||||
|
|
||||||
|
AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1])
|
||||||
|
|
||||||
|
DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data)
|
||||||
|
PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser)
|
||||||
|
LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier)
|
||||||
|
|
||||||
|
AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION])
|
||||||
|
AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION])
|
||||||
|
AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION])
|
||||||
|
|
||||||
AC_CONFIG_FILES([Makefile
|
AC_CONFIG_FILES([Makefile
|
||||||
libpostal.pc
|
libpostal.pc
|
||||||
src/Makefile
|
src/Makefile
|
||||||
test/Makefile])
|
src/libpostal_data
|
||||||
|
test/Makefile], [chmod +x src/libpostal_data])
|
||||||
|
|
||||||
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
|
AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes])
|
||||||
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
|
AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes])
|
||||||
|
|||||||
Reference in New Issue
Block a user