[data/models] Updating the libpostal download script to download the new models. The simple data files are stored by libpostal major version, whereas the models are stored by the version of the training data they were built from. A file called "latest" is stored in S3 to indicate the latest version of each model and is checked on make.
This commit is contained in:
@@ -7,6 +7,8 @@ if [ "$#" -lt 3 ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Version string used to build the S3 prefix under which the simple data
# files are stored (see LIBPOSTAL_DATA_S3_PREFIX).
LIBPOSTAL_VERSION_STRING="v1"
|
||||
|
||||
# S3 bucket name, from which both address forms below are derived.
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
|
||||
# s3:// form of the bucket, used as the destination for `aws s3 cp` uploads.
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
|
||||
# HTTP form of the bucket, used to build the URLs fetched with curl.
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
|
||||
@@ -14,6 +16,10 @@ LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
|
||||
# Archive file names for the address parser and language classifier models.
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
|
||||
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
||||
|
||||
# Path prefixes inside the bucket. The data prefix embeds the libpostal
# version string; the model prefixes get the contents of their "latest"
# file appended before use.
LIBPOSTAL_DATA_S3_PREFIX="data/$LIBPOSTAL_VERSION_STRING"
|
||||
LIBPOSTAL_PARSER_S3_PREFIX="models/address_parser"
|
||||
LIBPOSTAL_LANG_CLASS_S3_PREFIX="models/language_classifier"
|
||||
|
||||
# Positional arguments:
#   $1 - command, compared against "download" and "upload"
#   $2 - which file set: "base", "parser", "language_classifier" or "all"
#   $3 - local data directory that archives are extracted into / built from
COMMAND=$1
|
||||
FILE=$2
|
||||
LIBPOSTAL_DATA_DIR=$3
|
||||
@@ -89,8 +95,11 @@ download_multipart() {
|
||||
download_file() {
|
||||
updated_path=$1
|
||||
data_dir=$2
|
||||
filename=$3
|
||||
name=$4
|
||||
prefix=$3
|
||||
filename=$4
|
||||
name=$5
|
||||
shift 5
|
||||
subdirs=$@
|
||||
|
||||
local_path=$data_dir/$filename
|
||||
|
||||
@@ -100,7 +109,7 @@ download_file() {
|
||||
|
||||
echo "Checking for new libpostal $name..."
|
||||
|
||||
url=$LIBPOSTAL_S3_BUCKET_URL/$filename
|
||||
url=$LIBPOSTAL_S3_BUCKET_URL/$prefix/$filename
|
||||
|
||||
if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
|
||||
echo "New libpostal $name available"
|
||||
@@ -117,6 +126,9 @@ download_file() {
|
||||
elif stat -f %Sm . >/dev/null 2>&1; then
|
||||
echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path;
|
||||
fi;
|
||||
for subdir in $subdirs; do
|
||||
rm -rf $data_dir/$subdir;
|
||||
done
|
||||
tar -xvzf $local_path -C $data_dir;
|
||||
rm $local_path;
|
||||
else
|
||||
@@ -128,32 +140,38 @@ if [ $COMMAND = "download" ]; then
|
||||
mkdir -p $LIBPOSTAL_DATA_DIR
|
||||
|
||||
# Download the simple (base) data files when "base" or "all" is requested.
# download_file takes: updated_path, data_dir, S3 prefix, filename, display
# name, then the subdirectories to remove before the archive is extracted.
# The older four-argument call (without the S3 prefix) is dropped: under the
# five-argument signature it would pass the filename as the prefix.
if [ "$FILE" = "base" ] || [ "$FILE" = "all" ]; then
    # BASIC_MODULE_DIRS is intentionally unquoted: it is a whitespace-separated
    # list that must split into one argument per subdirectory.
    download_file "$LIBPOSTAL_DATA_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$LIBPOSTAL_DATA_S3_PREFIX" "$LIBPOSTAL_DATA_FILE" "data file" $BASIC_MODULE_DIRS
fi
|
||||
# Download the address parser model when "parser" or "all" is requested.
# The model version is read from the "latest" file under the parser prefix in
# the bucket and appended to that prefix to form the download location.
# The older call that folded the prefix into the filename is dropped: it does
# not match download_file's five-argument signature.
if [ "$FILE" = "parser" ] || [ "$FILE" = "all" ]; then
    latest_parser=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest")
    if [ -n "$latest_parser" ]; then
        parser_s3_prefix="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser"
        download_file "$LIBPOSTAL_PARSER_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$parser_s3_prefix" "$LIBPOSTAL_PARSER_FILE" "parser data file" $PARSER_MODULE_DIR
    else
        # An empty version would produce a malformed URL; report it instead
        # of silently checking a path that cannot exist.
        echo "Could not determine the latest libpostal parser model version; skipping" >&2
    fi
fi
|
||||
# Download the language classifier model when "language_classifier" or "all"
# is requested. The model version is read from the "latest" file under the
# classifier prefix in the bucket and appended to that prefix.
# The older four-argument call (without the S3 prefix) is dropped: it does not
# match download_file's five-argument signature.
if [ "$FILE" = "language_classifier" ] || [ "$FILE" = "all" ]; then
    latest_lang_class=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest")
    if [ -n "$latest_lang_class" ]; then
        lang_class_s3_prefix="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class"
        download_file "$LIBPOSTAL_LANG_CLASS_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$lang_class_s3_prefix" "$LIBPOSTAL_LANG_CLASS_FILE" "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
    else
        # An empty version would produce a malformed URL; report it instead
        # of silently checking a path that cannot exist.
        echo "Could not determine the latest libpostal language classifier model version; skipping" >&2
    fi
fi
|
||||
|
||||
elif [ $COMMAND = "upload" ]; then
|
||||
|
||||
# Package and upload the simple (base) data files when "base" or "all" is
# requested. The archive is built inside the data directory and copied to the
# versioned data prefix in the bucket, which is where the download side
# fetches it from.
if [ "$FILE" = "base" ] || [ "$FILE" = "all" ]; then
    # BASIC_MODULE_DIRS is intentionally unquoted: it is a whitespace-separated
    # list of directories to include in the archive.
    tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" $BASIC_MODULE_DIRS
    # The destination must be a full s3:// URI; the bare prefix would be
    # treated as a local path. The trailing slash keeps the archive's own
    # file name as the final key component.
    aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "$LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/"
fi
|
||||
|
||||
# Package and upload the address parser model when "parser" or "all" is
# requested. The target version directory is taken from the "latest" file
# under the parser prefix in the bucket.
if [ "$FILE" = "parser" ] || [ "$FILE" = "all" ]; then
    latest_parser=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest")
    if [ -z "$latest_parser" ]; then
        # Without a version the archive would be published to the wrong key.
        echo "Could not determine the latest libpostal parser model version; aborting upload" >&2
        exit 1
    fi
    tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE" $PARSER_MODULE_DIR
    # Full s3:// URI with a trailing slash, so the archive lands at
    # <prefix>/<version>/<archive name>, matching the download URL.
    parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
    aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE" "$parser_s3_dir"
fi
|
||||
|
||||
# Package and upload the language classifier model when "language_classifier"
# or "all" is requested. The target version directory is taken from the
# "latest" file under the classifier prefix in the bucket.
if [ "$FILE" = "language_classifier" ] || [ "$FILE" = "all" ]; then
    latest_lang_class=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest")
    if [ -z "$latest_lang_class" ]; then
        # Without a version the archive would be published to the wrong key.
        echo "Could not determine the latest libpostal language classifier model version; aborting upload" >&2
        exit 1
    fi
    tar -C "$LIBPOSTAL_DATA_DIR" -cvzf "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE" $LANGUAGE_CLASSIFIER_MODULE_DIR
    # The prefix variable must be expanded (the original omitted the "$") and
    # the destination must be a full s3:// URI. The trailing slash makes the
    # archive land at <prefix>/<version>/<archive name>, matching the
    # download URL.
    lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
    aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE" "$lang_class_s3_dir"
fi
|
||||
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user