[data/models] updating libpostal download script to download new models. The simple data files are stored by libpostal major version, whereas the models are stored by the version of the training data they used. A file called "latest" is stored in S3 under each model's prefix to indicate the latest version of that model, and it is checked on make.

This commit is contained in:
Al
2017-03-31 13:35:07 -04:00
parent 6d4c7984df
commit a64c81b45b

View File

@@ -7,6 +7,8 @@ if [ "$#" -lt 3 ]; then
exit 1
fi
# Version tag for the plain data files; it is interpolated into
# LIBPOSTAL_DATA_S3_PREFIX ("data/<version>").
LIBPOSTAL_VERSION_STRING="v1"
# Name of the S3 bucket that both addressing schemes below are built from.
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
# s3:// URI of the bucket; passed as a destination to `aws s3 cp` in the upload branch.
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
# Plain-HTTP URL of the same bucket; the curl calls build their request URLs on it.
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
@@ -14,6 +16,10 @@ LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
# Archive file names for the two model tarballs.
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
# Key prefixes inside the bucket. The data prefix embeds LIBPOSTAL_VERSION_STRING;
# the two model prefixes get a version component appended later, read from the
# "latest" object under each prefix.
LIBPOSTAL_DATA_S3_PREFIX="data/$LIBPOSTAL_VERSION_STRING"
LIBPOSTAL_PARSER_S3_PREFIX="models/address_parser"
LIBPOSTAL_LANG_CLASS_S3_PREFIX="models/language_classifier"
# Positional arguments:
#   $1 - command, compared against "download" and "upload"
#   $2 - file set, compared against "base", "parser", "language_classifier" and "all"
#   $3 - local data directory that archives are written to / packed from
COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3
@@ -89,8 +95,11 @@ download_multipart() {
download_file() {
updated_path=$1
data_dir=$2
filename=$3
name=$4
prefix=$3
filename=$4
name=$5
shift 5
subdirs=$@
local_path=$data_dir/$filename
@@ -100,7 +109,7 @@ download_file() {
echo "Checking for new libpostal $name..."
url=$LIBPOSTAL_S3_BUCKET_URL/$filename
url=$LIBPOSTAL_S3_BUCKET_URL/$prefix/$filename
if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
echo "New libpostal $name available"
@@ -117,6 +126,9 @@ download_file() {
elif stat -f %Sm . >/dev/null 2>&1; then
echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path;
fi;
for subdir in $subdirs; do
rm -rf $data_dir/$subdir;
done
tar -xvzf $local_path -C $data_dir;
rm $local_path;
else
@@ -128,32 +140,38 @@ if [ $COMMAND = "download" ]; then
mkdir -p $LIBPOSTAL_DATA_DIR
# Download the base data archive when the requested file set is "base" or "all".
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
# NOTE(review): the two calls below use different argument orders. This one
# passes (updated_path, data_dir, filename, name); the next one inserts an S3
# prefix as the third argument and appends the directories to clear before
# extraction. Only one order can match download_file's parameter list —
# confirm which call is live.
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE "data file"
# $BASIC_MODULE_DIRS is intentionally unquoted so each directory becomes its
# own trailing argument.
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_S3_PREFIX $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
fi
# Download the address parser model when the requested file set is "parser" or "all".
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
# NOTE(review): two variants of the same step follow. The first builds the
# version into the file name and calls download_file with four arguments; the
# second passes the versioned prefix as a separate third argument. Confirm
# which one matches download_file's parameter order.
# Read the current model version from the "latest" object in the bucket.
latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/models/address_parser/latest)
parser_filename="models/address_parser/$latest_parser/$LIBPOSTAL_PARSER_FILE"
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_filename "parser data file"
# Same lookup, expressed through LIBPOSTAL_PARSER_S3_PREFIX.
latest_parser=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest")
# Versioned prefix: <parser prefix>/<latest version>.
parser_s3_prefix="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser"
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_s3_prefix $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
fi
# Download the language classifier model when the requested file set is
# "language_classifier" or "all".
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
# NOTE(review): this four-argument call and the seven-argument call below use
# different argument orders; confirm which one matches download_file.
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file"
# Read the current model version from the "latest" object under the classifier prefix.
latest_lang_class=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest")
# Versioned prefix: <classifier prefix>/<latest version>.
lang_class_s3_prefix="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class"
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $lang_class_s3_prefix $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
fi
elif [ $COMMAND = "upload" ]; then
# Upload the base data archive when the requested file set is "base" or "all".
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
# Pack the basic module directories (relative to the data dir) into the tarball.
# $BASIC_MODULE_DIRS is intentionally unquoted: it is a whitespace-separated list.
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
# Copy to the bucket root (kept exactly as found).
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY
# Copy under the versioned data prefix. The destination has to be a full
# s3:// URI: a bare "data/<version>" is a local path, and `aws s3 cp` refuses
# local-to-local copies. The trailing slash makes the archive keep its file
# name under the prefix, which is the key the download URL
# ($LIBPOSTAL_S3_BUCKET_URL/<prefix>/<filename>) points at.
aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "$LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/"
fi
# Upload the address parser model when the requested file set is "parser" or "all".
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
# Read the current model version from the "latest" object under the parser prefix.
latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
# Pack the parser module directory (relative to the data dir) into the tarball.
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
# Copy to the bucket root (kept exactly as found).
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $LIBPOSTAL_S3_KEY
# Without a version the destination would collapse to "<prefix>//", so stop
# rather than upload to a malformed key.
if [ -z "$latest_parser" ]; then
echo "Could not determine the latest address parser version" >&2
exit 1
fi
# The destination has to be a full s3:// URI: a bare "models/..." path is
# treated as a local path by `aws s3 cp`. The trailing slash keeps the
# archive's file name under the versioned prefix.
parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE" "$parser_s3_dir"
fi
# Upload the language classifier model when the requested file set is
# "language_classifier" or "all".
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
# Read the current model version from the "latest" object under the classifier prefix.
latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
# Pack the classifier module directory (relative to the data dir) into the tarball.
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
# Copy to the bucket root (kept exactly as found).
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LIBPOSTAL_S3_KEY
# Without a version the destination would collapse to "<prefix>//", so stop
# rather than upload to a malformed key.
if [ -z "$latest_lang_class" ]; then
echo "Could not determine the latest language classifier version" >&2
exit 1
fi
# The prefix variable must be expanded with "$"; without it the literal text
# "LIBPOSTAL_LANG_CLASS_S3_PREFIX" became the destination. The destination
# also has to be a full s3:// URI, and the trailing slash keeps the archive's
# file name under the versioned prefix.
lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE" "$lang_class_s3_dir"
fi
else