#!/bin/sh
#
# libpostal_data: download or upload libpostal data archives.
#
# Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir
#
#   $1  command: "upload" or "download"
#   $2  which archive(s) to act on: "base", "parser", "language_classifier" or "all"
#   $3  local data directory

# Abort on the first unhandled command failure.
set -e

# All three positional arguments are required.
if [ "$#" -lt 3 ]; then
    echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir" >&2
    exit 1
fi

# Version tag of the data layout; compared against the contents of
# $LIBPOSTAL_DATA_DIR/data_version by the download command.
LIBPOSTAL_VERSION_STRING="v1"

# Remote locations.
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
LIBPOSTAL_S3_BUCKET_URL="https://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
LIBPOSTAL_CLOUDFRONT_URL="https://d1p366rbd94x8u.cloudfront.net"

# Archive file names.
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"

# Key prefixes under which the archives are stored.
LIBPOSTAL_DATA_S3_PREFIX="data/$LIBPOSTAL_VERSION_STRING"
LIBPOSTAL_PARSER_S3_PREFIX="models/address_parser"
LIBPOSTAL_LANG_CLASS_S3_PREFIX="models/language_classifier"

# Positional arguments (their presence is checked at the top of the script).
COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3

LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

# Quoted, and preceded by "--", so that a data directory containing spaces
# or starting with "-" is created as a single path.
mkdir -p -- "$LIBPOSTAL_DATA_DIR"

# Files holding the timestamp of the last successful download of each archive.
LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser
LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier

# Subdirectories of the data directory contained in each archive.
# BASIC_MODULE_DIRS is a space-separated list and is expanded unquoted on purpose.
BASIC_MODULE_DIRS="address_expansions numex transliteration"
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier

# Force the C locale so that the output of date(1) and friends is not localized.
export LC_ALL=C

# Timestamp written to a last_updated file that does not exist yet.
EPOCH_DATE="Jan 1 00:00:00 1970"

# Multipart download tuning: chunk size in bytes, the size from which a file
# is downloaded in chunks, and the number of parallel chunk downloads.
MB=$((1024 * 1024))
CHUNK_SIZE=$((64 * MB))
LARGE_FILE_SIZE=$((CHUNK_SIZE * 2))
NUM_WORKERS=12

# SIGINT handler: terminate any background jobs of this shell, then exit.
#
# The job list is captured with a command substitution instead of being piped
# into xargs: with an empty list, some xargs implementations still run "kill"
# once with no arguments, which only prints a usage error.
#
# Exits with status 130 (128 + SIGINT) so that an interrupted run is never
# reported as a success.
kill_background_processes() {
    bg_pids=$(jobs -p)
    if [ -n "$bg_pids" ]; then
        # Unquoted on purpose: one PID per word. Jobs that have already
        # finished are not an error here.
        kill $bg_pids 2>/dev/null || true
    fi
    exit 130
}

trap kill_background_processes INT

# Command templates executed through `sh -c` once per chunk of a multipart
# download. The positional parameters seen by the template are:
#   $1 part number   $2 first byte offset   $3 last byte offset
#   $4 URL           $5 file the part is written to
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
# "$4" and "$5" are quoted so a URL or path containing spaces stays one
# argument. --fail makes curl exit non-zero on an HTTP error instead of
# saving the error document as if it were chunk data.
PART_CURL='curl "$4" --silent --fail -H "Range: bytes=$2-$3" --retry 3 --retry-delay 2 -o "$5"'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"

# download_multipart URL DEST SIZE
#
# Download URL into DEST as parallel byte-range requests and concatenate the
# parts in order.
#
#   URL   address of the file to fetch
#   DEST  path of the assembled file; parts are written to DEST.1, DEST.2, ...
#   SIZE  total size of the remote file in bytes
#
# Reads the globals CHUNK_SIZE (bytes per part), NUM_WORKERS (parallel
# downloads) and DOWNLOAD_PART (command template run for each part with the
# arguments: part number, first offset, last offset, URL, part file).
# Returns non-zero if a part download or the assembly fails.
#
# Working variables carry an "mp_" prefix: plain sh has no function-local
# variables, and shorter names would overwrite same-named variables of the
# caller.
download_multipart() {
    mp_url=$1
    mp_dest=$2
    mp_size=$3

    # Integer division: the final part also carries the remainder.
    mp_num_chunks=$((mp_size / CHUNK_SIZE))
    if [ "$mp_num_chunks" -lt 1 ]; then
        # A file smaller than one chunk is still fetched, as a single part.
        mp_num_chunks=1
    fi
    echo "Downloading multipart: $mp_url, size=$mp_size, num_chunks=$mp_num_chunks"

    mp_offset=0
    mp_i=0
    # Emit five NUL-separated fields per part; xargs hands each group of five
    # to one `sh -c "$DOWNLOAD_PART"` invocation.
    while [ "$mp_i" -lt "$mp_num_chunks" ]; do
        mp_i=$((mp_i + 1))
        if [ "$mp_i" -lt "$mp_num_chunks" ]; then
            mp_max=$((mp_offset + CHUNK_SIZE - 1))
        else
            # Byte ranges are inclusive, so the last byte is size - 1.
            mp_max=$((mp_size - 1))
        fi
        printf '%s\0%s\0%s\0%s\0%s\0' "$mp_i" "$mp_offset" "$mp_max" "$mp_url" "$mp_dest.$mp_i"
        mp_offset=$((mp_offset + CHUNK_SIZE))
    done | xargs -0 -n 5 -P "$NUM_WORKERS" sh -c "$DOWNLOAD_PART" -- || {
        echo "Multipart download failed: $mp_url" >&2
        return 1
    }

    # Start from an empty destination, then append the parts in order.
    : > "$mp_dest"
    mp_i=0
    while [ "$mp_i" -lt "$mp_num_chunks" ]; do
        mp_i=$((mp_i + 1))
        cat "$mp_dest.$mp_i" >> "$mp_dest" || return 1
        rm -f -- "$mp_dest.$mp_i"
    done
}

# download_file UPDATED_PATH DATA_DIR PREFIX FILENAME NAME [SUBDIR...]
#
# Download and unpack one archive if the server has a copy newer than the
# timestamp recorded in UPDATED_PATH.
#
#   UPDATED_PATH  file holding the date of the last successful download
#   DATA_DIR      directory the archive is downloaded to and extracted in
#   PREFIX        path of the archive's directory below $LIBPOSTAL_CLOUDFRONT_URL
#   FILENAME      archive file name
#   NAME          human-readable name used in progress messages
#   SUBDIR...     subdirectories of DATA_DIR deleted before extraction
#
# Reads the globals EPOCH_DATE, LIBPOSTAL_CLOUDFRONT_URL and LARGE_FILE_SIZE,
# and calls download_multipart for files of at least LARGE_FILE_SIZE bytes.
# UPDATED_PATH is rewritten only after the archive was extracted, so a failed
# run is retried next time instead of being reported as up to date.
download_file() {
    updated_path=$1
    data_dir=$2
    prefix=$3
    filename=$4
    name=$5
    shift 5
    # The remaining positional parameters are the SUBDIR arguments.

    # Plain global, name kept as is: other code in this script may read it.
    local_path=$data_dir/$filename
    url=$LIBPOSTAL_CLOUDFRONT_URL/$prefix/$filename

    if [ ! -e "$updated_path" ]; then
        echo "$EPOCH_DATE" > "$updated_path"
    fi

    echo "Checking for new libpostal $name..."

    # Conditional HEAD request: status 200 means the remote file is newer than
    # the recorded date. Any other outcome, including a failed request, is
    # treated as "nothing new".
    http_code=$(curl -sI "$url" -z "$(cat "$updated_path")" --remote-time -w '%{http_code}' -o /dev/null) || http_code=
    if [ "$http_code" != "200" ]; then
        echo "libpostal $name up to date"
        return 0
    fi

    echo "New libpostal $name available"

    # Header names are matched case-insensitively: HTTP/2 responses carry them
    # in lowercase. Anything that is not a plain number counts as size 0 so
    # the numeric comparison below is always well-formed.
    content_length=$(curl -I "$url" 2>/dev/null \
        | awk 'tolower($1) == "content-length:" { len = $2 } END { print len }' \
        | tr -d '[:space:]')
    case "$content_length" in
        '' | *[!0-9]*) content_length=0 ;;
    esac

    if [ "$content_length" -ge "$LARGE_FILE_SIZE" ]; then
        download_multipart "$url" "$local_path" "$content_length"
    else
        # --fail: an HTTP error aborts here instead of saving the error page
        # as the archive.
        curl "$url" --fail --retry 3 --retry-delay 2 -o "$local_path"
    fi

    # New timestamp: modification time of the downloaded file plus one second.
    # Computed now, because the archive is deleted after extraction.
    new_stamp=
    if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then
        # date accepts "-r FILE" and "-d @SECONDS" (GNU style).
        mtime=$(date -ur "$local_path" +%s)
        new_stamp=$(date -ud "@$((mtime + 1))")
    elif stat -f %Sm . >/dev/null 2>&1; then
        # stat accepts "-f FORMAT" and date accepts "-r SECONDS -v" (BSD style).
        new_stamp=$(date -ur "$(stat -f %m "$local_path")" -v+1S)
    fi

    # Remove the directories the archive is about to recreate. ${data_dir:?}
    # aborts rather than deleting from "/" if the directory is empty.
    for subdir in "$@"; do
        rm -rf -- "${data_dir:?}/$subdir"
    done

    tar -xvzf "$local_path" --no-same-owner -C "$data_dir" || {
        echo "Failed to extract $local_path" >&2
        return 1
    }
    rm -- "$local_path"

    if [ -n "$new_stamp" ]; then
        printf '%s\n' "$new_stamp" > "$updated_path"
    fi
}

# fetch_latest_version PREFIX
#
# Print the contents of the "latest" object stored below PREFIX in the S3
# bucket. --fail makes an HTTP error a failure instead of letting the error
# document be used as a version name.
fetch_latest_version() {
    curl --silent --fail "$LIBPOSTAL_S3_BUCKET_URL/$1/latest" || {
        echo "Could not fetch $LIBPOSTAL_S3_BUCKET_URL/$1/latest" >&2
        return 1
    }
}

# Command dispatch. $FILE selects the archive(s): "base", "parser",
# "language_classifier" or "all". Any other value matches none of the branches.
if [ "$COMMAND" = "download" ]; then
    if [ -e "$LIBPOSTAL_DATA_VERSION_FILE" ]; then
        LIBPOSTAL_DATA_DIR_VERSION=$(cat "$LIBPOSTAL_DATA_VERSION_FILE")
    fi

    # A data directory written for another version (or with no version file)
    # is cleared so that everything is downloaded again.
    if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
        echo "Old version of datadir detected, removing..."
        # $BASIC_MODULE_DIRS is a space-separated list: unquoted on purpose.
        # ${LIBPOSTAL_DATA_DIR:?} aborts rather than deleting from "/" if the
        # data directory is empty.
        for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
            rm -rf -- "${LIBPOSTAL_DATA_DIR:?}/$subdir"
        done

        # Legacy, blow it away too to be nice
        rm -rf -- "${LIBPOSTAL_DATA_DIR:?}/geodb"

        rm -f -- "$LIBPOSTAL_DATA_UPDATED_PATH"
        rm -f -- "$LIBPOSTAL_LANG_CLASS_UPDATED_PATH"
        rm -f -- "$LIBPOSTAL_PARSER_UPDATED_PATH"
    fi

    mkdir -p -- "$LIBPOSTAL_DATA_DIR"

    if [ "$FILE" = "base" ] || [ "$FILE" = "all" ]; then
        download_file "$LIBPOSTAL_DATA_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_DATA_S3_PREFIX" "$LIBPOSTAL_DATA_FILE" "data file" $BASIC_MODULE_DIRS
    fi

    if [ "$FILE" = "parser" ] || [ "$FILE" = "all" ]; then
        latest_parser=$(fetch_latest_version "$LIBPOSTAL_PARSER_S3_PREFIX")
        parser_s3_prefix="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser"
        download_file "$LIBPOSTAL_PARSER_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$parser_s3_prefix" "$LIBPOSTAL_PARSER_FILE" "parser data file" "$PARSER_MODULE_DIR"
    fi

    if [ "$FILE" = "language_classifier" ] || [ "$FILE" = "all" ]; then
        latest_lang_class=$(fetch_latest_version "$LIBPOSTAL_LANG_CLASS_S3_PREFIX")
        lang_class_s3_prefix="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class"
        download_file "$LIBPOSTAL_LANG_CLASS_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$lang_class_s3_prefix" "$LIBPOSTAL_LANG_CLASS_FILE" "language classifier data file" "$LANGUAGE_CLASSIFIER_MODULE_DIR"
    fi

    # Record the version the data directory now corresponds to.
    if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
        echo "$LIBPOSTAL_VERSION_STRING" > "$LIBPOSTAL_DATA_VERSION_FILE"
    fi

elif [ "$COMMAND" = "upload" ]; then

    if [ "$FILE" = "base" ] || [ "$FILE" = "all" ]; then
        tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" $BASIC_MODULE_DIRS
        aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "$LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/"
    fi

    if [ "$FILE" = "parser" ] || [ "$FILE" = "all" ]; then
        # The archive is uploaded next to the version "latest" currently names.
        latest_parser=$(fetch_latest_version "$LIBPOSTAL_PARSER_S3_PREFIX")
        tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE" "$PARSER_MODULE_DIR"
        parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
        aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE" "$parser_s3_dir"
    fi

    if [ "$FILE" = "language_classifier" ] || [ "$FILE" = "all" ]; then
        latest_lang_class=$(fetch_latest_version "$LIBPOSTAL_LANG_CLASS_S3_PREFIX")
        tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE" "$LANGUAGE_CLASSIFIER_MODULE_DIR"
        lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
        aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE" "$lang_class_s3_dir"
    fi

else
    echo "Invalid command: $COMMAND"
    exit 1
fi