[data/models] Update the libpostal download script to download the new models. The simple data files are stored by libpostal major version, whereas the models are stored by the version of the training data they used. A file called "latest" is stored in S3 to indicate the latest version of each model and is checked on make.
This commit is contained in:
@@ -7,6 +7,8 @@ if [ "$#" -lt 3 ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
LIBPOSTAL_VERSION_STRING="v1"
|
||||||
|
|
||||||
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
|
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
|
||||||
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
|
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
|
||||||
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
|
LIBPOSTAL_S3_BUCKET_URL="http://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
|
||||||
@@ -14,6 +16,10 @@ LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
|
|||||||
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
|
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
|
||||||
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
||||||
|
|
||||||
|
LIBPOSTAL_DATA_S3_PREFIX="data/$LIBPOSTAL_VERSION_STRING"
|
||||||
|
LIBPOSTAL_PARSER_S3_PREFIX="models/address_parser"
|
||||||
|
LIBPOSTAL_LANG_CLASS_S3_PREFIX="models/language_classifier"
|
||||||
|
|
||||||
COMMAND=$1
|
COMMAND=$1
|
||||||
FILE=$2
|
FILE=$2
|
||||||
LIBPOSTAL_DATA_DIR=$3
|
LIBPOSTAL_DATA_DIR=$3
|
||||||
@@ -89,8 +95,11 @@ download_multipart() {
|
|||||||
download_file() {
|
download_file() {
|
||||||
updated_path=$1
|
updated_path=$1
|
||||||
data_dir=$2
|
data_dir=$2
|
||||||
filename=$3
|
prefix=$3
|
||||||
name=$4
|
filename=$4
|
||||||
|
name=$5
|
||||||
|
shift 5
|
||||||
|
subdirs=$@
|
||||||
|
|
||||||
local_path=$data_dir/$filename
|
local_path=$data_dir/$filename
|
||||||
|
|
||||||
@@ -100,7 +109,7 @@ download_file() {
|
|||||||
|
|
||||||
echo "Checking for new libpostal $name..."
|
echo "Checking for new libpostal $name..."
|
||||||
|
|
||||||
url=$LIBPOSTAL_S3_BUCKET_URL/$filename
|
url=$LIBPOSTAL_S3_BUCKET_URL/$prefix/$filename
|
||||||
|
|
||||||
if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
|
if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
|
||||||
echo "New libpostal $name available"
|
echo "New libpostal $name available"
|
||||||
@@ -117,6 +126,9 @@ download_file() {
|
|||||||
elif stat -f %Sm . >/dev/null 2>&1; then
|
elif stat -f %Sm . >/dev/null 2>&1; then
|
||||||
echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path;
|
echo $(date -ur $(stat -f %m $local_path) -v+1S) > $updated_path;
|
||||||
fi;
|
fi;
|
||||||
|
for subdir in $subdirs; do
|
||||||
|
rm -rf $data_dir/$subdir;
|
||||||
|
done
|
||||||
tar -xvzf $local_path -C $data_dir;
|
tar -xvzf $local_path -C $data_dir;
|
||||||
rm $local_path;
|
rm $local_path;
|
||||||
else
|
else
|
||||||
@@ -128,32 +140,38 @@ if [ $COMMAND = "download" ]; then
|
|||||||
mkdir -p $LIBPOSTAL_DATA_DIR
|
mkdir -p $LIBPOSTAL_DATA_DIR
|
||||||
|
|
||||||
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
||||||
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_FILE "data file"
|
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_S3_PREFIX $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
|
||||||
fi
|
fi
|
||||||
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
||||||
latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/models/address_parser/latest)
|
latest_parser=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest")
|
||||||
parser_filename="models/address_parser/$latest_parser/$LIBPOSTAL_PARSER_FILE"
|
parser_s3_prefix="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser"
|
||||||
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_filename "parser data file"
|
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_s3_prefix $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
|
||||||
fi
|
fi
|
||||||
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
||||||
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file"
|
latest_lang_class=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest")
|
||||||
|
lang_class_s3_prefix="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class"
|
||||||
|
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $lang_class_s3_prefix $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
|
||||||
fi
|
fi
|
||||||
|
|
||||||
elif [ $COMMAND = "upload" ]; then
|
elif [ $COMMAND = "upload" ]; then
|
||||||
|
|
||||||
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
||||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
|
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
|
||||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY
|
# Upload into the versioned data prefix of the bucket. The destination must be
# a full s3:// URI ending in "/" so the object lands at
# <bucket>/<prefix>/<file>, the path download_file fetches from.
aws s3 cp --acl=public-read "$LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE" "$LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
||||||
|
latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
|
||||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
|
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
|
||||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $LIBPOSTAL_S3_KEY
|
# Destination "directory" for the parser archive: s3://<bucket>/<prefix>/<version>/.
# The s3:// bucket URI is required; without it `aws s3 cp` is handed two local paths.
parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
|
||||||
|
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
||||||
|
latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
|
||||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
|
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
|
||||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LIBPOSTAL_S3_KEY
|
# Destination "directory" for the language classifier archive:
# s3://<bucket>/<prefix>/<version>/. The prefix must be expanded as a variable
# (the `$` was missing), and the trailing slash makes `aws s3 cp` keep the
# archive's file name under this key, mirroring parser_s3_dir.
lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
|
||||||
|
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir
|
||||||
fi
|
fi
|
||||||
|
|
||||||
else
|
else
|
||||||
|
|||||||
Reference in New Issue
Block a user