From b94519122ef0a0f934dccae7aaaf5f6128356d21 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 18 Nov 2018 11:44:28 -0500 Subject: [PATCH 01/11] [build] adding new in-repo version files for each of the data files so we don't have to use the Github API on Travis builds and a template version of the download script so that the version numbers can be configured based on those files. --- src/libpostal_data.in | 205 +++++++++++++++++++++++++++++++++++ versions/base | 1 + versions/base_data | 1 + versions/language_classifier | 1 + versions/parser | 1 + 5 files changed, 209 insertions(+) create mode 100755 src/libpostal_data.in create mode 100644 versions/base create mode 100644 versions/base_data create mode 100644 versions/language_classifier create mode 100644 versions/parser diff --git a/src/libpostal_data.in b/src/libpostal_data.in new file mode 100755 index 00000000..882ae0cf --- /dev/null +++ b/src/libpostal_data.in @@ -0,0 +1,205 @@ +#!/bin/sh + +set -e + +if [ "$#" -lt 3 ]; then + echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir" + exit 1 +fi + +COMMAND=$1 +FILE=$2 +LIBPOSTAL_DATA_DIR=$3 + +MB=$((1024*1024)) +CHUNK_SIZE=$((64*$MB)) + +# Not loving this approach but there appears to be no way to query the size +# of a release asset without using the Github API +LIBPOSTAL_DATA_FILE_CHUNKS=1 +LIBPOSTAL_PARSER_MODEL_CHUNKS=12 +LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS=1 + +LIBPOSTAL_DATA_DIR_VERSION_STRING="@LIBPOSTAL_DATA_DIR_VERSION_STRING@" + +LIBPOSTAL_DATA_FILE_LATEST_VERSION="@LIBPOSTAL_DATA_FILE_LATEST_VERSION@" +LIBPOSTAL_PARSER_MODEL_LATEST_VERSION="@LIBPOSTAL_PARSER_MODEL_LATEST_VERSION@" +LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION="@LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION@" + +LIBPOSTAL_REPO_NAME="openvenues/libpostal" + +LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz" +LIBPOSTAL_PARSER_FILE="parser.tar.gz" +LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" + 
+LIBPOSTAL_BASE_URL="https://github.com/$LIBPOSTAL_REPO_NAME/releases/download" + +LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version +LIBPOSTAL_DATA_DIR_VERSION= + +mkdir -p $LIBPOSTAL_DATA_DIR + +LIBPOSTAL_DATA_FILE_VERSION_PATH=$LIBPOSTAL_DATA_DIR/base_data_file_version +LIBPOSTAL_PARSER_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/parser_model_file_version +LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH=$LIBPOSTAL_DATA_DIR/language_classifier_model_file_version + +BASIC_MODULE_DIRS="address_expansions numex transliteration" +PARSER_MODULE_DIR=address_parser +LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier + + +kill_background_processes() { + jobs -p | xargs kill; + exit +} + +trap kill_background_processes INT + +PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"' +PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5' +DOWNLOAD_PART="$PART_MSG;$PART_CURL" + + +download_release_multipart() { + url=$1 + filename=$2 + num_chunks=$3 + + echo "Downloading multipart: $url, num_chunks=$num_chunks" + offset=0 + i=0 + while [ $i -lt $num_chunks ]; do + i=$((i+1)) + part_filename="$filename.$i" + if [ $i -lt $num_chunks ]; then + max=$((offset+CHUNK_SIZE-1)); + else + max=""; + fi; + printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename" + offset=$((offset+CHUNK_SIZE)) + done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" -- + + > $local_path + + i=0 + while [ $i -lt $num_chunks ]; do + i=$((i+1)) + part_filename="$filename.$i" + cat $part_filename >> $local_path + rm $part_filename + done; + +} + + +download_release() { + version_file_path=$1 + version=$2 + data_dir=$3 + num_chunks=$4 + filename=$5 + name=$6 + shift 6 + subdirs=$@ + + echo $version_file_path + echo $version + echo $data_dir + echo $num_chunks + echo $filename + echo $name + echo $subdirs + + local_path=$data_dir/$filename + + url=$LIBPOSTAL_BASE_URL/$version/$filename + + if [ ! 
-e $version_file_path ]; then + current_version="" + else + current_version="$(cat $version_path)" + fi; + + echo "Checking for new libpostal $name..." + + if [ $current_version -ne $version ]; then + echo "New libpostal $name available" + + if [ $num_chunks -gt 1 ]; then + download_release_multipart $url $local_path $num_chunks + else + curl -L $url --retry 3 --retry-delay 2 -o $local_path + fi + + if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then + echo $(date -ud "$(date -ud "@$(date -ur $local_path +%s)") + 1 second") > $updated_path; + elif stat -f %Sm . >/dev/null 2>&1; then + echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path; + fi; + for subdir in $subdirs; do + rm -rf $data_dir/$subdir; + done + tar -xvzf $local_path --no-same-owner -C $data_dir; + rm $local_path; + else + echo "libpostal $name up to date" + fi +} + +if [ $COMMAND = "download" ]; then + if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then + LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE) + fi + + if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_DATA_DIR_VERSION_STRING" ]; then + echo "Old version of datadir detected, removing..." 
+ for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do + rm -rf $LIBPOSTAL_DATA_DIR/$subdir; + done + + # Legacy, blow it away too to be nice + if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then + rm -rf $LIBPOSTAL_DATA_DIR/geodb; + fi + + rm -f $LIBPOSTAL_DATA_DIR/last_updated* + fi + + mkdir -p $LIBPOSTAL_DATA_DIR + + if ([ $FILE = "base" ] || [ $FILE = "all" ]); then + download_release $LIBPOSTAL_DATA_FILE_VERSION_PATH $LIBPOSTAL_DATA_FILE_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE_CHUNKS $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS + fi + if ([ $FILE = "parser" ] || [ $FILE = "all" ]); then + download_release $LIBPOSTAL_PARSER_MODEL_VERSION_PATH $LIBPOSTAL_PARSER_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_PARSER_MODEL_CHUNKS $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR + fi + if ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then + download_release $LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH $LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR + fi + +elif [ $COMMAND = "upload" ]; then + echo "upload not implemented yet" + + #if [ $FILE = "base" ] || [ $FILE = "all" ]; then + # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS + # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/ + #fi + + #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then + # latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest) + # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR + # parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/" + # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir + #fi + + #if [ $FILE = "language_classifier" 
] || [ $FILE = "all" ]; then + # latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest) + # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR + # lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/" + # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir + #fi +else + echo "Invalid command: $COMMAND" + exit 1 +fi diff --git a/versions/base b/versions/base new file mode 100644 index 00000000..a8597845 --- /dev/null +++ b/versions/base @@ -0,0 +1 @@ +v1.1-alpha \ No newline at end of file diff --git a/versions/base_data b/versions/base_data new file mode 100644 index 00000000..60453e69 --- /dev/null +++ b/versions/base_data @@ -0,0 +1 @@ +v1.0.0 \ No newline at end of file diff --git a/versions/language_classifier b/versions/language_classifier new file mode 100644 index 00000000..60453e69 --- /dev/null +++ b/versions/language_classifier @@ -0,0 +1 @@ +v1.0.0 \ No newline at end of file diff --git a/versions/parser b/versions/parser new file mode 100644 index 00000000..0ec25f75 --- /dev/null +++ b/versions/parser @@ -0,0 +1 @@ +v1.0.0 From 641395e81104af9114f1767d7686cb0345fffdd2 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 18 Nov 2018 11:53:46 -0500 Subject: [PATCH 02/11] [build] configure now uses the in-repo version strings and builds the libpostal_data script from a template --- configure.ac | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 0dea296c..abb6d41a 100644 --- a/configure.ac +++ b/configure.ac @@ -2,8 +2,8 @@ # Process this file with autoconf to produce a configure script. 
m4_define(LIBPOSTAL_MAJOR_VERSION, [1]) -m4_define(LIBPOSTAL_MINOR_VERSION, [0]) -m4_define(LIBPOSTAL_PATCH_VERSION, [0]) +m4_define(LIBPOSTAL_MINOR_VERSION, [1]) +m4_define(LIBPOSTAL_PATCH_VERSION, [alpha]) AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION) @@ -50,10 +50,21 @@ AC_CHECK_TYPES([ptrdiff_t]) # Checks for library functions. AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup]) +AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1]) + +DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data) +PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser) +LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier) + +AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION]) + AC_CONFIG_FILES([Makefile libpostal.pc src/Makefile - test/Makefile]) + src/libpostal_data + test/Makefile], [chmod +x src/libpostal_data]) AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes]) AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes]) @@ -85,6 +96,7 @@ AC_ARG_ENABLE([data-download], *) AC_MSG_ERROR([bad value ${enableval} for --disable-data-download]) ;; esac], [DOWNLOAD_DATA=true]) + AM_CONDITIONAL([DOWNLOAD_DATA], [test "x$DOWNLOAD_DATA" = "xtrue"]) AC_ARG_WITH(cflags-scanner-extra, [AS_HELP_STRING([--with-cflags-scanner-extra@<:@=VALUE@:>@], [Extra compilation options for scanner.c])], From 9542c8c43576c522f9bc0be1634829b020ad4d2a Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 13 Dec 2018 13:35:01 -0500 Subject: [PATCH 03/11] [build] adding libpostal_data.in which allows us to fetch data files from Github without using the API --- .travis.yml | 60 ++++++++--- src/libpostal_data | 232 ------------------------------------------------------------------------------ src/libpostal_data.in | 21 ---- 3 files changed, 45
insertions(+), 268 deletions(-) delete mode 100755 src/libpostal_data diff --git a/.travis.yml b/.travis.yml index ab34fbd0..1f3dbacf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,16 +13,20 @@ env: - DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l) - NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l) - TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l) + - TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER + - SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz + - LIBPOSTAL_DATA_DIR=$(pwd)/data + - LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz compiler: - clang - gcc addons: apt: sources: - - ubuntu-toolchain-r-test + - ubuntu-toolchain-r-test packages: - - gcc-4.8 - - pkg-config + - gcc-4.8 + - pkg-config before_script: - ./bootstrap.sh - if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi; @@ -31,23 +35,49 @@ before_script: install: - if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi script: - - ./configure --datadir=$(pwd)/data + - ./configure --datadir=$LIBPOSTAL_DATA_DIR - make -j4 - if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi; - if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi; - if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi; - make check + after_success: - | - if [[ "$CC" == gcc* && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then - env/bin/pip install awscli; - export PATH=$PATH:env/bin/; - ./src/libpostal_data upload base $(pwd)/data/libpostal; - git clone -b 
master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1 - cp src/*_data.c _travis/src - cd _travis - git config user.name "$GIT_COMMITTER_NAME"; - git config user.email "$GIT_COMMITTER_EMAIL"; - git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER"; - git push --quiet origin master; + if [[ "$CC" == "gcc" && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then + if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then + export PATH=$PATH:env/bin/; + git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1 + cp src/*_data.c _travis/src + echo "$TAG_VERSION" > _travis/versions/base_data + cd _travis + git config user.name "$GIT_COMMITTER_NAME"; + git config user.email "$GIT_COMMITTER_EMAIL"; + git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER"; + git push --quiet origin master; + + tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME $BASIC_MODULE_DIRS + fi + git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER"; + git push --tags --quiet origin master; fi; + +before_deploy: + - make dist + +deploy: + - provider: releases + file: + - "$SRC_TARBALL_FILENAME" + on: + tags: true + branch: master + skip_cleanup: true + - provider: releases + file: + - "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME" + on: + tags: true + branch: master + condition: "$CC = gcc && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )" + skip_cleanup: true diff --git a/src/libpostal_data b/src/libpostal_data deleted file mode 100755 index 77c2dd0b..00000000 --- a/src/libpostal_data +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/sh - -set -e - -if [ "$#" -lt 3 ]; then - echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir" - exit 1 -fi - -LIBPOSTAL_VERSION_STRING="v1" 
-LIBPOSTAL_RELEASE_VERSION_STRING="v1.0.0" - -LIBPOSTAL_REPO_NAME="openvenues/libpostal" -LIBPOSTAL_S3_BUCKET_NAME="libpostal" -LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME" - -GITHUB_API_URL="https://api.github.com" -LIBPOSTAL_RELEASE_API_URL="$GITHUB_API_URL/repos/$LIBPOSTAL_REPO_NAME/releases" - -LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz" -LIBPOSTAL_PARSER_FILE="parser.tar.gz" -LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" - -LIBPOSTAL_DATA_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/parser.tar.gz" -LIBPOSTAL_PARSER_S3_PREFIX="$LIBPOSTAL_LATEST_DATA_VERSION_STRING/libpostal_data.tar.gz" -LIBPOSTAL_LANG_CLASS_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/language_classifier.tar.gz" - -COMMAND=$1 -FILE=$2 -LIBPOSTAL_DATA_DIR=$3 - -LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version -LIBPOSTAL_DATA_DIR_VERSION= - -mkdir -p $LIBPOSTAL_DATA_DIR - -LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated -LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser -LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier - -BASIC_MODULE_DIRS="address_expansions numex transliteration" -PARSER_MODULE_DIR=address_parser -LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier - -export LC_ALL=C - -EPOCH_DATE="Jan 1 00:00:00 1970" - -MB=$((1024*1024)) -CHUNK_SIZE=$((64*$MB)) - -LARGE_FILE_SIZE=$((CHUNK_SIZE*2)) - - -NUM_WORKERS=12 - -kill_background_processes() { - jobs -p | xargs kill; - exit -} - -trap kill_background_processes INT - -PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"' -PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5' -DOWNLOAD_PART="$PART_MSG;$PART_CURL" - - -download_multipart() { - url=$1 - filename=$2 - size=$3 - - num_chunks=$((size/CHUNK_SIZE)) - echo "Downloading multipart: $url, size=$size, num_chunks=$num_chunks" - offset=0 - i=0 - while [ $i -lt $num_chunks ]; do - i=$((i+1)) - part_filename="$filename.$i" - if [ $i 
-lt $num_chunks ]; then - max=$((offset+CHUNK_SIZE-1)); - else - max=$size; - fi; - printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename" - offset=$((offset+CHUNK_SIZE)) - done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" -- - - > $local_path - - i=0 - while [ $i -lt $num_chunks ]; do - i=$((i+1)) - part_filename="$filename.$i" - cat $part_filename >> $local_path - rm $part_filename - done; - -} - - -download_file() { - updated_path=$1 - data_dir=$2 - metadata_url=$3 - url=$4 - size=$5 - filename=$6 - name=$7 - shift 7 - subdirs=$@ - - local_path=$data_dir/$filename - - if [ ! -e $updated_path ]; then - echo "$EPOCH_DATE" > $updated_path; - fi; - - echo "Checking for new libpostal $name..." - - if [ $(curl -LsI $metadata_url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then - echo "New libpostal $name available" - - if [ $size -ge $LARGE_FILE_SIZE ]; then - download_multipart $url $local_path $size - else - curl -L $url --retry 3 --retry-delay 2 -o $local_path - fi - - if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then - echo $(date -ud "$(date -ud "@$(date -ur $local_path +%s)") + 1 second") > $updated_path; - elif stat -f %Sm . >/dev/null 2>&1; then - echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path; - fi; - for subdir in $subdirs; do - rm -rf $data_dir/$subdir; - done - tar -xvzf $local_path --no-same-owner -C $data_dir; - rm $local_path; - else - echo "libpostal $name up to date" - fi -} - -if [ $COMMAND = "download" ]; then - if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then - LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE) - fi - - if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then - echo "Old version of datadir detected, removing..." 
- for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do - rm -rf $LIBPOSTAL_DATA_DIR/$subdir; - done - - # Legacy, blow it away too to be nice - if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then - rm -rf $LIBPOSTAL_DATA_DIR/geodb; - fi - - rm -f $LIBPOSTAL_DATA_UPDATED_PATH - rm -f $LIBPOSTAL_LANG_CLASS_UPDATED_PATH - rm -f $LIBPOSTAL_PARSER_UPDATED_PATH - fi - - mkdir -p $LIBPOSTAL_DATA_DIR - - release_id=$(curl -s $LIBPOSTAL_RELEASE_API_URL/tags/$LIBPOSTAL_RELEASE_VERSION_STRING | grep "\"id\"" | head -n1 | grep -o '[0-9][0-9]*') - release_assets="$(curl -s $LIBPOSTAL_RELEASE_API_URL/$release_id/assets)" - - asset_names_tempfile="$LIBPOSTAL_DATA_DIR/asset_names.tmp" - echo "$release_assets" | grep -o '"name": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_names_tempfile - asset_metadata_tempfile="$LIBPOSTAL_DATA_DIR/asset_metadata.tmp" - echo "$release_assets" | grep -o '"url": *"[^"]*/releases/assets/[0-9]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_metadata_tempfile - asset_urls_tempfile="$LIBPOSTAL_DATA_DIR/asset_urls.tmp" - echo "$release_assets" | grep -o '"browser_download_url": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_urls_tempfile - asset_sizes_tempfile="$LIBPOSTAL_DATA_DIR/asset_sizes.tmp" - echo "$release_assets" | grep -o '"size": *[0-9]*' | grep -o '[0-9]*$' > $asset_sizes_tempfile - - assets_tempfile="$LIBPOSTAL_DATA_DIR/assets.tmp" - paste -d' ' $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile > $assets_tempfile - - rm $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile - - while read -r line; do - asset=$(echo "$line" | cut -f1 -d' ') - asset_metadata_url=$(echo "$line" | cut -f2 -d' ') - asset_url=$(echo "$line" | cut -f3 -d' ') - asset_size=$(echo "$line" | cut -f4 -d' ') - - if [ $asset = $LIBPOSTAL_DATA_FILE ] && ([ $FILE = "base" ] || [ $FILE = "all" ]); then - download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR 
$asset_metadata_url $asset_url $asset_size $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS - fi - if [ $asset = $LIBPOSTAL_PARSER_FILE ] && ([ $FILE = "parser" ] || [ $FILE = "all" ]); then - download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR - fi - if [ $asset = $LIBPOSTAL_LANG_CLASS_FILE ] && ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then - download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR - fi - - if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then - echo $LIBPOSTAL_VERSION_STRING > $LIBPOSTAL_DATA_VERSION_FILE; - fi - done < $assets_tempfile; - rm $assets_tempfile - -elif [ $COMMAND = "upload" ]; then - echo "upload not implemented yet" - - #if [ $FILE = "base" ] || [ $FILE = "all" ]; then - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/ - #fi - - #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then - # latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR - # parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir - #fi - - #if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then - # latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR - # 
lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir - #fi -else - echo "Invalid command: $COMMAND" - exit 1 -fi diff --git a/src/libpostal_data.in b/src/libpostal_data.in index 882ae0cf..176b1a4c 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -178,27 +178,6 @@ if [ $COMMAND = "download" ]; then download_release $LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH $LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR fi -elif [ $COMMAND = "upload" ]; then - echo "upload not implemented yet" - - #if [ $FILE = "base" ] || [ $FILE = "all" ]; then - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/ - #fi - - #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then - # latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR - # parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir - #fi - - #if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then - # latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR - # lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir - #fi else echo "Invalid command: $COMMAND" 
exit 1 From 78fc457e759cfcd0d2ad2ada0b423dde6cc05fc8 Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 13 Dec 2018 14:39:24 -0500 Subject: [PATCH 04/11] [build/windows] attempting same changes to the Windows configure script --- windows/configure.ac | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/windows/configure.ac b/windows/configure.ac index 0a964cf5..c45564d2 100644 --- a/windows/configure.ac +++ b/windows/configure.ac @@ -2,8 +2,8 @@ # Process this file with autoconf to produce a configure script. m4_define(LIBPOSTAL_MAJOR_VERSION, [1]) -m4_define(LIBPOSTAL_MINOR_VERSION, [0]) -m4_define(LIBPOSTAL_PATCH_VERSION, [0]) +m4_define(LIBPOSTAL_MINOR_VERSION, [1]) +m4_define(LIBPOSTAL_PATCH_VERSION, [alpha]) AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION) @@ -50,10 +50,21 @@ AC_CHECK_TYPES([ptrdiff_t]) # Checks for library functions. AC_CHECK_FUNCS([malloc realloc drand48 getcwd gettimeofday memmove memset regcomp setlocale sqrt strdup strndup]) +AC_SUBST([LIBPOSTAL_DATA_DIR_VERSION_STRING], [v1]) + +DATA_FILE_LATEST_VERSION=$(cat $srcdir/versions/base_data) +PARSER_MODEL_LATEST_VERSION=$(cat $srcdir/versions/parser) +LANG_CLASS_MODEL_LATEST_VERSION=$(cat $srcdir/versions/language_classifier) + +AC_SUBST([LIBPOSTAL_DATA_FILE_LATEST_VERSION], [$DATA_FILE_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_PARSER_MODEL_LATEST_VERSION], [$PARSER_MODEL_LATEST_VERSION]) +AC_SUBST([LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION], [$LANG_CLASS_MODEL_LATEST_VERSION]) + AC_CONFIG_FILES([Makefile libpostal.pc src/Makefile - test/Makefile]) + src/libpostal_data + test/Makefile], [chmod +x src/libpostal_data]) AC_CHECK_PROG([FOUND_SHUF], [shuf], [yes]) AC_CHECK_PROG([FOUND_GSHUF], [gshuf], [yes]) From 9ae41dfaa49d2f46cd1cf0c4c1f8b05d33940f3d Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 29 Dec 2018 17:45:57 -0500 Subject: [PATCH 05/11] [fix] base_data should be v1.1-alpha --- versions/base | 1 - versions/base_data | 2 +- 
2 files changed, 1 insertion(+), 2 deletions(-) delete mode 100644 versions/base diff --git a/versions/base b/versions/base deleted file mode 100644 index a8597845..00000000 --- a/versions/base +++ /dev/null @@ -1 +0,0 @@ -v1.1-alpha \ No newline at end of file diff --git a/versions/base_data b/versions/base_data index 60453e69..a8597845 100644 --- a/versions/base_data +++ b/versions/base_data @@ -1 +1 @@ -v1.0.0 \ No newline at end of file +v1.1-alpha \ No newline at end of file From a60aa9bfb12f154e1f84cc9abe968c4adb6ed102 Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 29 Dec 2018 20:16:14 -0500 Subject: [PATCH 06/11] [fix] seem to not be able to use a non-numeric version number in the Windows configure script, hopefully that fixes the Appveyor build --- windows/configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/windows/configure.ac b/windows/configure.ac index c45564d2..6e920ce3 100644 --- a/windows/configure.ac +++ b/windows/configure.ac @@ -3,7 +3,7 @@ m4_define(LIBPOSTAL_MAJOR_VERSION, [1]) m4_define(LIBPOSTAL_MINOR_VERSION, [1]) -m4_define(LIBPOSTAL_PATCH_VERSION, [alpha]) +m4_define(LIBPOSTAL_PATCH_VERSION, [0]) AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION) From acebc951b60ea8a922eec06a14d75473127b1a52 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 12 Feb 2019 12:32:34 -0500 Subject: [PATCH 07/11] [fix] != instead of -ne --- src/libpostal_data.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libpostal_data.in b/src/libpostal_data.in index 176b1a4c..a50341a8 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -123,7 +123,7 @@ download_release() { echo "Checking for new libpostal $name..." 
- if [ $current_version -ne $version ]; then + if [ "$current_version" != "$version" ]; then echo "New libpostal $name available" if [ $num_chunks -gt 1 ]; then From fdb3b7e32e2318632e9752dbf091a0f3391fc980 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 12 Feb 2019 13:32:58 -0500 Subject: [PATCH 08/11] =?UTF-8?q?[build]=20fixing=20multi-part=20downloads?= =?UTF-8?q?,=20which=20no=20longer=20know=20the=20Content-Length=20so=20no?= =?UTF-8?q?=20longer=20know=20the=20byte=20ange=20for=20the=20last=20chunk?= =?UTF-8?q?.=20No=20longer=20needs=20to=20use=20any=20file=20dates/timesta?= =?UTF-8?q?mps=20(compatibility=20=F0=9F=8E=89),=20only=20the=20version=20?= =?UTF-8?q?numbers=20that=20are=20checked=20in=20to=20the=20repo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/libpostal_data.in | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/libpostal_data.in b/src/libpostal_data.in index a50341a8..c181e5b2 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -47,6 +47,7 @@ BASIC_MODULE_DIRS="address_expansions numex transliteration" PARSER_MODULE_DIR=address_parser LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier +NUM_WORKERS=12 kill_background_processes() { jobs -p | xargs kill; @@ -71,22 +72,18 @@ download_release_multipart() { while [ $i -lt $num_chunks ]; do i=$((i+1)) part_filename="$filename.$i" - if [ $i -lt $num_chunks ]; then - max=$((offset+CHUNK_SIZE-1)); - else - max=""; - fi; + max=$((offset+CHUNK_SIZE-1)); printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename" offset=$((offset+CHUNK_SIZE)) done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" -- - > $local_path + > $filename i=0 while [ $i -lt $num_chunks ]; do i=$((i+1)) part_filename="$filename.$i" - cat $part_filename >> $local_path + cat $part_filename >> $filename rm $part_filename done; @@ -103,14 +100,6 @@ download_release() { shift 6 subdirs=$@ - echo 
$version_file_path - echo $version - echo $data_dir - echo $num_chunks - echo $filename - echo $name - echo $subdirs - local_path=$data_dir/$filename url=$LIBPOSTAL_BASE_URL/$version/$filename @@ -118,7 +107,8 @@ download_release() { if [ ! -e $version_file_path ]; then current_version="" else - current_version="$(cat $version_path)" + current_version="$(cat $version_file_path)" + fi; echo "Checking for new libpostal $name..." @@ -132,16 +122,12 @@ download_release() { curl -L $url --retry 3 --retry-delay 2 -o $local_path fi - if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then - echo $(date -ud "$(date -ud "@$(date -ur $local_path +%s)") + 1 second") > $updated_path; - elif stat -f %Sm . >/dev/null 2>&1; then - echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path; - fi; for subdir in $subdirs; do rm -rf $data_dir/$subdir; done tar -xvzf $local_path --no-same-owner -C $data_dir; rm $local_path; + echo "$version" > $version_file_path; else echo "libpostal $name up to date" fi From 6a50eb7b22527c026c9b82fde7e236803f15daf2 Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 12 Feb 2019 13:34:54 -0500 Subject: [PATCH 09/11] [fix/build] base_data should currently be at v1.0.0 in terms of where the files are uploaded --- versions/base_data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versions/base_data b/versions/base_data index a8597845..60453e69 100644 --- a/versions/base_data +++ b/versions/base_data @@ -1 +1 @@ -v1.1-alpha \ No newline at end of file +v1.0.0 \ No newline at end of file From 3c0e97ae59b7d4cc1d2ff171d2d9f5373189e76c Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 12 Feb 2019 14:09:03 -0500 Subject: [PATCH 10/11] [fix] patch version also has to be an integer --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index abb6d41a..f44582c9 100644 --- a/configure.ac +++ b/configure.ac @@ -3,7 +3,7 @@ m4_define(LIBPOSTAL_MAJOR_VERSION, [1]) m4_define(LIBPOSTAL_MINOR_VERSION, 
[1]) -m4_define(LIBPOSTAL_PATCH_VERSION, [alpha]) +m4_define(LIBPOSTAL_PATCH_VERSION, [0]) AC_INIT([libpostal], LIBPOSTAL_MAJOR_VERSION.LIBPOSTAL_MINOR_VERSION.LIBPOSTAL_PATCH_VERSION) From 7a23d8922ad0dcaf116c71453d46a32e14f4573b Mon Sep 17 00:00:00 2001 From: Al Date: Tue, 12 Feb 2019 14:10:02 -0500 Subject: [PATCH 11/11] [build] data dir version handling --- src/libpostal_data.in | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/libpostal_data.in b/src/libpostal_data.in index c181e5b2..0a3d27f2 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -136,20 +136,21 @@ download_release() { if [ $COMMAND = "download" ]; then if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE) - fi - if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_DATA_DIR_VERSION_STRING" ]; then - echo "Old version of datadir detected, removing..." - for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do - rm -rf $LIBPOSTAL_DATA_DIR/$subdir; - done + if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_DATA_DIR_VERSION_STRING" ]; then + echo "Old version of datadir detected, removing..." 
+ for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do + rm -rf $LIBPOSTAL_DATA_DIR/$subdir; + done - # Legacy, blow it away too to be nice - if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then - rm -rf $LIBPOSTAL_DATA_DIR/geodb; + # Legacy, blow it away too to be nice + if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then + rm -rf $LIBPOSTAL_DATA_DIR/geodb; + fi + + rm -f $LIBPOSTAL_DATA_DIR/last_updated* + rm -f $LIBPOSTAL_DATA_DIR/*_version fi - - rm -f $LIBPOSTAL_DATA_DIR/last_updated* fi mkdir -p $LIBPOSTAL_DATA_DIR @@ -164,6 +165,8 @@ if [ $COMMAND = "download" ]; then download_release $LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH $LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR fi + echo "$LIBPOSTAL_DATA_DIR_VERSION_STRING" > $LIBPOSTAL_DATA_VERSION_FILE + else echo "Invalid command: $COMMAND" exit 1