Files
libpostal/src/libpostal_data

172 lines
5.3 KiB
Bash
Executable File

#!/bin/sh
set -e
if [ "$#" -lt 3 ]; then
echo "Usage: ./libpostal_data [upload|download] [base|geodb] data_dir"
exit 1
fi
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
LIBPOSTAL_GEODB_FILE="geodb.tar.gz"
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3
mkdir -p $LIBPOSTAL_DATA_DIR
LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
LIBPOSTAL_GEO_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_geo
LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser
LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier
BASIC_MODULE_DIRS="address_expansions numex transliteration"
GEODB_MODULE_DIR=geodb
PARSER_MODULE_DIR=address_parser
LANGUAGE_CLASSIFIER_MODULE_DIR=language_classifier
export LC_ALL=C
EPOCH_DATE="Jan 1 00:00:00 1970"
MB=$((1024*1024))
CHUNK_SIZE=$((64*$MB))
LARGE_FILE_SIZE=$((CHUNK_SIZE*2))
NUM_WORKERS=10
kill_background_processes() {
jobs -p | xargs kill;
exit
}
trap kill_background_processes INT
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"
download_multipart() {
url=$1
filename=$2
size=$3
num_chunks=$((size/CHUNK_SIZE))
echo "Downloading multipart: $url, size=$size, num_chunks=$num_chunks"
offset=0
i=0
while [ $i -lt $num_chunks ]; do
i=$((i+1))
part_filename="$filename.$i"
if [ $i -lt $num_chunks ]; then
max=$((offset+CHUNK_SIZE-1));
else
max=$size;
fi;
printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
offset=$((offset+CHUNK_SIZE))
done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" --
> $local_path
i=0
while [ $i -lt $num_chunks ]; do
i=$((i+1))
part_filename="$filename.$i"
cat $part_filename >> $local_path
rm $part_filename
done;
}
download_file() {
updated_path=$1
data_dir=$2
filename=$3
name=$4
local_path=$data_dir/$filename
if [ ! -e $updated_path ]; then
echo "$EPOCH_DATE" > $updated_path;
fi;
echo "Checking for new libpostal $name..."
url=$LIBPOSTAL_S3_BUCKET_URL/$filename
if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
echo "New libpostal $name available"
content_length=$(curl -I $url 2> /dev/null | awk '/^Content-Length:/ { print $2 }' | tr -d '[[:space:]]')
if [ $content_length -ge $LARGE_FILE_SIZE ]; then
download_multipart $url $local_path $content_length
else
curl $url --retry 3 --retry-delay 2 -o $local_path
fi
if date -ur . >/dev/null 2>&1; then
echo $(date -d "$(date -d "@$(date -ur $local_path +%s)") + 1 second") > $updated_path;
elif stat -f %Sm . >/dev/null 2>&1; then
echo $(date -r $(stat -f %m $local_path) -v+1S) > $updated_path;
fi;
tar -xvzf $local_path -C $data_dir;
rm $local_path;
else
echo "libpostal $name up to date"
fi
}
if [ $COMMAND = "download" ]; then
mkdir -p $LIBPOSTAL_DATA_DIR
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE "data file"
fi
if [ $FILE = "geodb" ] || [ $FILE = "all" ]; then
download_file $LIBPOSTAL_GEO_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_GEODB_FILE "geodb data file"
fi
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_PARSER_FILE "parser data file"
fi
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file"
fi
elif [ $COMMAND = "upload" ]; then
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY
fi
if [ $FILE = "geodb" ] || [ $FILE = "all" ]; then
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_GEODB_FILE $GEODB_MODULE_DIR
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_GEODB_FILE $LIBPOSTAL_S3_KEY
fi
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $LIBPOSTAL_S3_KEY
fi
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LIBPOSTAL_S3_KEY
fi
else
echo "Invalid command: $COMMAND"
exit 1
fi