diff --git a/.travis.yml b/.travis.yml index ab34fbd0..1f3dbacf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,16 +13,20 @@ env: - DICTIONARIES_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/dictionaries/.*/.*.txt\|src/gazetteer_data.c" | wc -l) - NUMEX_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "resources/numex\|src/numex_table_builder.c" | wc -l) - TRANSLIT_CHANGED=$(git diff --name-only $TRAVIS_COMMIT_RANGE | grep "src/transliteration_data.c" | wc -l) + - TAG_VERSION=$(cat ./versions/base).$TRAVIS_BUILD_NUMBER + - SRC_TARBALL_FILENAME=libpostal-$(cat ./versions/base).tar.gz + - LIBPOSTAL_DATA_DIR=$(pwd)/data + - LIBPOSTAL_DATA_FILENAME=libpostal_data.tar.gz compiler: - clang - gcc addons: apt: sources: - - ubuntu-toolchain-r-test + - ubuntu-toolchain-r-test packages: - - gcc-4.8 - - pkg-config + - gcc-4.8 + - pkg-config before_script: - ./bootstrap.sh - if [[ $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 ]]; then git clone https://github.com/pypa/virtualenv; cd virtualenv; git checkout master; python virtualenv.py ../env; cd ..; env/bin/pip install -r scripts/requirements-simple.txt; fi; @@ -31,23 +35,49 @@ before_script: install: - if [ "$CC" = "gcc" ]; then export CC="gcc-4.8"; fi script: - - ./configure --datadir=$(pwd)/data + - ./configure --datadir=$LIBPOSTAL_DATA_DIR - make -j4 - if [[ $DICTIONARIES_CHANGED -ne 0 ]]; then ./src/build_address_dictionary; fi; - if [[ $NUMEX_CHANGED -ne 0 ]]; then ./src/build_numex_table; fi; - if [[ $TRANSLIT_CHANGED -ne 0 ]]; then ./src/build_trans_table; fi; - make check + after_success: - | - if [[ "$CC" == gcc* && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then - env/bin/pip install awscli; - export PATH=$PATH:env/bin/; - ./src/libpostal_data upload base $(pwd)/data/libpostal; - git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1 - cp src/*_data.c _travis/src - cd _travis - git config user.name "$GIT_COMMITTER_NAME"; - git config user.email "$GIT_COMMITTER_EMAIL"; - git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER"; - git push --quiet origin master; + if [[ "$CC" == "gcc" && "$TRAVIS_PULL_REQUEST" = "false" && "$TRAVIS_BRANCH" = "master" ]]; then + if [[ ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 ) ]]; then + export PATH=$PATH:env/bin/; + git clone -b master "https://${GH_TOKEN}@${GH_REF}" _travis > /dev/null 2>&1 || exit 1 + cp src/*_data.c _travis/src + echo "$TAG_VERSION" > _travis/versions/base_data + cd _travis + git config user.name "$GIT_COMMITTER_NAME"; + git config user.email "$GIT_COMMITTER_EMAIL"; + git commit -a -m "[auto][ci skip] Adding data files from Travis build #$TRAVIS_BUILD_NUMBER"; + git push --quiet origin master; + + tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME $BASIC_MODULE_DIRS + fi + git tag $TAG_VERSION -a -m "[auto][ci skip] Generating tag for Travis build #$TRAVIS_BUILD_NUMBER"; + git push --tags --quiet origin master; fi; + +before_deploy: + - make dist + +deploy: + - provider: releases + file: + - "$SRC_TARBALL_FILENAME" + on: + tags: true + branch: master + skip_cleanup: true + - provider: releases + file: + - "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILENAME" + on: + tags: true + branch: master + condition: "$CC = gcc && ( $DICTIONARIES_CHANGED -ne 0 || $NUMEX_CHANGED -ne 0 || $TRANSLIT_CHANGED -ne 0 )" + skip_cleanup: true diff --git a/src/libpostal_data b/src/libpostal_data deleted file mode 100755 index 77c2dd0b..00000000 --- a/src/libpostal_data +++ /dev/null @@ -1,232 +0,0 @@ -#!/bin/sh - -set -e - -if [ "$#" -lt 3 ]; then - echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir" - exit 1 -fi - -LIBPOSTAL_VERSION_STRING="v1" -LIBPOSTAL_RELEASE_VERSION_STRING="v1.0.0" - -LIBPOSTAL_REPO_NAME="openvenues/libpostal" -LIBPOSTAL_S3_BUCKET_NAME="libpostal" -LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME" - -GITHUB_API_URL="https://api.github.com" -LIBPOSTAL_RELEASE_API_URL="$GITHUB_API_URL/repos/$LIBPOSTAL_REPO_NAME/releases" - -LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz" -LIBPOSTAL_PARSER_FILE="parser.tar.gz" -LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" - -LIBPOSTAL_DATA_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/parser.tar.gz" -LIBPOSTAL_PARSER_S3_PREFIX="$LIBPOSTAL_LATEST_DATA_VERSION_STRING/libpostal_data.tar.gz" -LIBPOSTAL_LANG_CLASS_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/language_classifier.tar.gz" - -COMMAND=$1 -FILE=$2 -LIBPOSTAL_DATA_DIR=$3 - -LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version -LIBPOSTAL_DATA_DIR_VERSION= - -mkdir -p $LIBPOSTAL_DATA_DIR - -LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated -LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser -LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier - -BASIC_MODULE_DIRS="address_expansions numex transliteration" -PARSER_MODULE_DIR=address_parser -LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier - -export LC_ALL=C - -EPOCH_DATE="Jan 1 00:00:00 1970" - -MB=$((1024*1024)) -CHUNK_SIZE=$((64*$MB)) - -LARGE_FILE_SIZE=$((CHUNK_SIZE*2)) - - -NUM_WORKERS=12 - -kill_background_processes() { - jobs -p | xargs kill; - exit -} - -trap kill_background_processes INT - -PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"' -PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5' -DOWNLOAD_PART="$PART_MSG;$PART_CURL" - - -download_multipart() { - url=$1 - filename=$2 - size=$3 - - num_chunks=$((size/CHUNK_SIZE)) - echo "Downloading multipart: $url, size=$size, num_chunks=$num_chunks" - offset=0 - i=0 - while [ $i -lt $num_chunks ]; do - i=$((i+1)) - part_filename="$filename.$i" - if [ $i -lt $num_chunks ]; then - max=$((offset+CHUNK_SIZE-1)); - else - max=$size; - fi; - printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename" - offset=$((offset+CHUNK_SIZE)) - done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" -- - - > $local_path - - i=0 - while [ $i -lt $num_chunks ]; do - i=$((i+1)) - part_filename="$filename.$i" - cat $part_filename >> $local_path - rm $part_filename - done; - -} - - -download_file() { - updated_path=$1 - data_dir=$2 - metadata_url=$3 - url=$4 - size=$5 - filename=$6 - name=$7 - shift 7 - subdirs=$@ - - local_path=$data_dir/$filename - - if [ ! -e $updated_path ]; then - echo "$EPOCH_DATE" > $updated_path; - fi; - - echo "Checking for new libpostal $name..." - - if [ $(curl -LsI $metadata_url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then - echo "New libpostal $name available" - - if [ $size -ge $LARGE_FILE_SIZE ]; then - download_multipart $url $local_path $size - else - curl -L $url --retry 3 --retry-delay 2 -o $local_path - fi - - if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then - echo $(date -ud "$(date -ud "@$(date -ur $local_path +%s)") + 1 second") > $updated_path; - elif stat -f %Sm . >/dev/null 2>&1; then - echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path; - fi; - for subdir in $subdirs; do - rm -rf $data_dir/$subdir; - done - tar -xvzf $local_path --no-same-owner -C $data_dir; - rm $local_path; - else - echo "libpostal $name up to date" - fi -} - -if [ $COMMAND = "download" ]; then - if [ -e $LIBPOSTAL_DATA_VERSION_FILE ]; then - LIBPOSTAL_DATA_DIR_VERSION=$(cat $LIBPOSTAL_DATA_VERSION_FILE) - fi - - if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then - echo "Old version of datadir detected, removing..." - for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do - rm -rf $LIBPOSTAL_DATA_DIR/$subdir; - done - - # Legacy, blow it away too to be nice - if [ -e $LIBPOSTAL_DATA_DIR/geodb ]; then - rm -rf $LIBPOSTAL_DATA_DIR/geodb; - fi - - rm -f $LIBPOSTAL_DATA_UPDATED_PATH - rm -f $LIBPOSTAL_LANG_CLASS_UPDATED_PATH - rm -f $LIBPOSTAL_PARSER_UPDATED_PATH - fi - - mkdir -p $LIBPOSTAL_DATA_DIR - - release_id=$(curl -s $LIBPOSTAL_RELEASE_API_URL/tags/$LIBPOSTAL_RELEASE_VERSION_STRING | grep "\"id\"" | head -n1 | grep -o '[0-9][0-9]*') - release_assets="$(curl -s $LIBPOSTAL_RELEASE_API_URL/$release_id/assets)" - - asset_names_tempfile="$LIBPOSTAL_DATA_DIR/asset_names.tmp" - echo "$release_assets" | grep -o '"name": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_names_tempfile - asset_metadata_tempfile="$LIBPOSTAL_DATA_DIR/asset_metadata.tmp" - echo "$release_assets" | grep -o '"url": *"[^"]*/releases/assets/[0-9]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_metadata_tempfile - asset_urls_tempfile="$LIBPOSTAL_DATA_DIR/asset_urls.tmp" - echo "$release_assets" | grep -o '"browser_download_url": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_urls_tempfile - asset_sizes_tempfile="$LIBPOSTAL_DATA_DIR/asset_sizes.tmp" - echo "$release_assets" | grep -o '"size": *[0-9]*' | grep -o '[0-9]*$' > $asset_sizes_tempfile - - assets_tempfile="$LIBPOSTAL_DATA_DIR/assets.tmp" - paste -d' ' $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile > $assets_tempfile - - rm $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile - - while read -r line; do - asset=$(echo "$line" | cut -f1 -d' ') - asset_metadata_url=$(echo "$line" | cut -f2 -d' ') - asset_url=$(echo "$line" | cut -f3 -d' ') - asset_size=$(echo "$line" | cut -f4 -d' ') - - if [ $asset = $LIBPOSTAL_DATA_FILE ] && ([ $FILE = "base" ] || [ $FILE = "all" ]); then - download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS - fi - if [ $asset = $LIBPOSTAL_PARSER_FILE ] && ([ $FILE = "parser" ] || [ $FILE = "all" ]); then - download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR - fi - if [ $asset = $LIBPOSTAL_LANG_CLASS_FILE ] && ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then - download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR - fi - - if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then - echo $LIBPOSTAL_VERSION_STRING > $LIBPOSTAL_DATA_VERSION_FILE; - fi - done < $assets_tempfile; - rm $assets_tempfile - -elif [ $COMMAND = "upload" ]; then - echo "upload not implemented yet" - - #if [ $FILE = "base" ] || [ $FILE = "all" ]; then - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/ - #fi - - #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then - # latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR - # parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir - #fi - - #if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then - # latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR - # lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir - #fi -else - echo "Invalid command: $COMMAND" - exit 1 -fi diff --git a/src/libpostal_data.in b/src/libpostal_data.in index 882ae0cf..176b1a4c 100755 --- a/src/libpostal_data.in +++ b/src/libpostal_data.in @@ -178,27 +178,6 @@ if [ $COMMAND = "download" ]; then download_release $LIBPOSTAL_LANG_CLASS_MODEL_VERSION_PATH $LIBPOSTAL_LANG_CLASS_MODEL_LATEST_VERSION $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_MODEL_CHUNKS $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR fi -elif [ $COMMAND = "upload" ]; then - echo "upload not implemented yet" - - #if [ $FILE = "base" ] || [ $FILE = "all" ]; then - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/ - #fi - - #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then - # latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR - # parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir - #fi - - #if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then - # latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest) - # tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR - # lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/" - # aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir - #fi else echo "Invalid command: $COMMAND" exit 1