diff --git a/src/libpostal_data b/src/libpostal_data index e653c9a2..4cd2eb79 100755 --- a/src/libpostal_data +++ b/src/libpostal_data @@ -7,6 +7,8 @@ if [ "$#" -lt 3 ]; then exit 1 fi +LIBPOSTAL_VERSION_STRING="v1" + LIBPOSTAL_S3_BUCKET_NAME="libpostal" LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME" LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com" @@ -14,6 +16,10 @@ LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz" LIBPOSTAL_PARSER_FILE="parser.tar.gz" LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" +LIBPOSTAL_DATA_S3_PREFIX="data/$LIBPOSTAL_VERSION_STRING" +LIBPOSTAL_PARSER_S3_PREFIX="models/address_parser" +LIBPOSTAL_LANG_CLASS_S3_PREFIX="models/language_classifier" + COMMAND=$1 FILE=$2 LIBPOSTAL_DATA_DIR=$3 @@ -89,8 +95,11 @@ download_multipart() { download_file() { updated_path=$1 data_dir=$2 - filename=$3 - name=$4 + prefix=$3 + filename=$4 + name=$5 + shift 5 + subdirs=$@ local_path=$data_dir/$filename @@ -100,7 +109,7 @@ download_file() { echo "Checking for new libpostal $name..." - url=$LIBPOSTAL_S3_BUCKET_URL/$filename + url=$LIBPOSTAL_S3_BUCKET_URL/$prefix/$filename if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then echo "New libpostal $name available" @@ -117,6 +126,9 @@ download_file() { elif stat -f %Sm . 
>/dev/null 2>&1; then echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path; fi; + for subdir in $subdirs; do + rm -rf $data_dir/$subdir; + done tar -xvzf $local_path -C $data_dir; rm $local_path; else @@ -128,32 +140,38 @@ if [ $COMMAND = "download" ]; then mkdir -p $LIBPOSTAL_DATA_DIR if [ $FILE = "base" ] || [ $FILE = "all" ]; then - download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE "data file" + download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_S3_PREFIX $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS fi if [ $FILE = "parser" ] || [ $FILE = "all" ]; then - latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/models/address_parser/latest) - parser_filename="models/address_parser/$latest_parser/$LIBPOSTAL_PARSER_FILE" - download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_filename "parser data file" + latest_parser=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest") + parser_s3_prefix="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser" + download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_s3_prefix $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR fi if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then - download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" + latest_lang_class=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest") + lang_class_s3_prefix="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class" + download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $lang_class_s3_prefix $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR fi elif [ $COMMAND = "upload" ]; then if [ $FILE = "base" ] || [ $FILE = "all" ]; then tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS - aws s3 cp --acl=public-read 
$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY + aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_DATA_S3_PREFIX fi if [ $FILE = "parser" ] || [ $FILE = "all" ]; then + latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest) tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR - aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $LIBPOSTAL_S3_KEY + parser_s3_dir="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/" + aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir fi if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then + latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest) tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR - aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LIBPOSTAL_S3_KEY + lang_class_s3_dir="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class" + aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir fi else