diff --git a/bootstrap.sh b/bootstrap.sh index 302de29c..3894e867 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -1,2 +1,2 @@ -#!/usr/bin/env bash +#!/bin/sh autoreconf -fi --warning=no-portability diff --git a/src/libpostal_data b/src/libpostal_data index 27bcf30d..671a327a 100755 --- a/src/libpostal_data +++ b/src/libpostal_data @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh set -e @@ -26,7 +26,7 @@ LIBPOSTAL_GEO_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_geo LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier -BASIC_MODULE_DIRS=(address_expansions numex transliteration) +BASIC_MODULE_DIRS="address_expansions numex transliteration" GEODB_MODULE_DIR=geodb PARSER_MODULE_DIR=address_parser LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier @@ -43,14 +43,14 @@ LARGE_FILE_SIZE=$((CHUNK_SIZE*2)) NUM_WORKERS=10 -function kill_background_processes { +kill_background_processes() { jobs -p | xargs kill; exit } -trap kill_background_processes SIGINT +trap kill_background_processes INT -function download_part() { +download_part() { i=$1 offset=$2 max=$3 @@ -59,9 +59,8 @@ function download_part() { echo "Downloading part $i: filename=$part_filename, offset=$offset, max=$max" curl $url --silent -H"Range:bytes=$offset-$max" -o $part_filename } -export -f download_part -function download_multipart() { +download_multipart() { url=$1 filename=$2 size=$3 @@ -79,9 +78,13 @@ function download_multipart() { else max=$size; fi; - printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename" + download_part "$i" "$offset" "$max" "$url" "$part_filename" & + # wait every time we have started $NUM_WORKERS processes + [ $((i%NUM_WORKERS)) -eq 0 ] && wait offset=$((offset+CHUNK_SIZE)) - done | xargs -0 -n 5 -P $NUM_WORKERS bash -c 'download_part "$@"' -- + done + # wait if $num_chunks wasn't exactly divisible by $NUM_WORKERS + wait > $local_path @@ -96,7 +99,7 @@ 
function download_multipart() { } -function download_file() { +download_file() { updated_path=$1 data_dir=$2 filename=$3 @@ -153,7 +156,7 @@ if [ $COMMAND = "download" ]; then elif [ $COMMAND = "upload" ]; then if [ $FILE = "base" ] || [ $FILE = "all" ]; then - tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE ${BASIC_MODULE_DIRS[*]} + tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY fi