From a64c81b45b0f9dbf7975c58c494aee456b4dfa9f Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 31 Mar 2017 13:35:07 -0400 Subject: [PATCH] [data/models] updating libpostal download script to download new models. The simple data files are stored by libpostal major version, whereas the models are stored by the version of the training data they used. A file called "latest" is stored in S3 to indicate the latest version of the model and checked on make --- src/libpostal_data | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/libpostal_data b/src/libpostal_data index e653c9a2..4cd2eb79 100755 --- a/src/libpostal_data +++ b/src/libpostal_data @@ -7,6 +7,8 @@ if [ "$#" -lt 3 ]; then exit 1 fi +LIBPOSTAL_VERSION_STRING="v1" + LIBPOSTAL_S3_BUCKET_NAME="libpostal" LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME" LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com" @@ -14,6 +16,10 @@ LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz" LIBPOSTAL_PARSER_FILE="parser.tar.gz" LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz" +LIBPOSTAL_DATA_S3_PREFIX="data/$LIBPOSTAL_VERSION_STRING" +LIBPOSTAL_PARSER_S3_PREFIX="models/address_parser" +LIBPOSTAL_LANG_CLASS_S3_PREFIX="models/language_classifier" + COMMAND=$1 FILE=$2 LIBPOSTAL_DATA_DIR=$3 @@ -89,8 +95,11 @@ download_multipart() { download_file() { updated_path=$1 data_dir=$2 - filename=$3 - name=$4 + prefix=$3 + filename=$4 + name=$5 + shift 5 + subdirs=$@ local_path=$data_dir/$filename @@ -100,7 +109,7 @@ download_file() { echo "Checking for new libpostal $name..." - url=$LIBPOSTAL_S3_BUCKET_URL/$filename + url=$LIBPOSTAL_S3_BUCKET_URL/$prefix/$filename if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then echo "New libpostal $name available" @@ -117,6 +126,9 @@ download_file() { elif stat -f %Sm . 
>/dev/null 2>&1; then echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path; fi; + for subdir in $subdirs; do + rm -rf $data_dir/$subdir; + done tar -xvzf $local_path -C $data_dir; rm $local_path; else @@ -128,32 +140,38 @@ if [ $COMMAND = "download" ]; then mkdir -p $LIBPOSTAL_DATA_DIR if [ $FILE = "base" ] || [ $FILE = "all" ]; then - download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE "data file" + download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_S3_PREFIX $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS fi if [ $FILE = "parser" ] || [ $FILE = "all" ]; then - latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/models/address_parser/latest) - parser_filename="models/address_parser/$latest_parser/$LIBPOSTAL_PARSER_FILE" - download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_filename "parser data file" + latest_parser=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest") + parser_s3_prefix="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser" + download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_s3_prefix $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR fi if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then - download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" + latest_lang_class=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest") + lang_class_s3_prefix="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class" + download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $lang_class_s3_prefix $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR fi elif [ $COMMAND = "upload" ]; then if [ $FILE = "base" ] || [ $FILE = "all" ]; then tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS - aws s3 cp --acl=public-read 
$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY + aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/ fi if [ $FILE = "parser" ] || [ $FILE = "all" ]; then + latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest) tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR - aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $LIBPOSTAL_S3_KEY + parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/" + aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir fi if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then + latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest) tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR - aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LIBPOSTAL_S3_KEY + lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/" + aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir fi else