Merge pull request #328 from openvenues/github_model_releases
GitHub model releases
This commit is contained in:
@@ -440,12 +440,12 @@ Training data
|
|||||||
|
|
||||||
In machine learning, large amounts of training data are often essential for getting good results. Many open-source machine learning projects either release only the model code (results reproducible if and only if you're Google), or a pre-baked model where the training conditions are unknown.
|
In machine learning, large amounts of training data are often essential for getting good results. Many open-source machine learning projects either release only the model code (results reproducible if and only if you're Google), or a pre-baked model where the training conditions are unknown.
|
||||||
|
|
||||||
Libpostal is a bit different because it's trained on open data that's available to everyone, so we've released the entire training pipeline (the [geodata](https://github.com/openvenues/libpostal/tree/master/scripts/geodata) package in this repo), as well as the resulting training data itself on S3. It's over 100GB unzipped.
|
Libpostal is a bit different because it's trained on open data that's available to everyone, so we've released the entire training pipeline (the [geodata](https://github.com/openvenues/libpostal/tree/master/scripts/geodata) package in this repo), as well as the resulting training data itself on the Internet Archive. It's over 100GB unzipped.
|
||||||
|
|
||||||
Training data are stored on S3 by the date they were created. There's also a file stored on S3 to point to the most recent training data. To always point to the latest data, use something like: ```latest=$(curl https://s3.amazonaws.com/libpostal/training_data/latest)``` and use that variable in place of the date.
|
Training data are stored on archive.org by the date they were created. There's also a file stored in the main directory of this repo called `current_parser_training_set` which stores the date of the most recently created training set. To always point to the latest data, try something like: ```latest=$(cat current_parser_training_set)``` and use that variable in place of the date.
|
||||||
|
|
||||||
### Parser training sets ###
|
### Parser training sets ###
|
||||||
All files can be found at https://d1p366rbd94x8u.cloudfront.net/training_data/$YYYY-MM-DD/parser/$FILE as gzip'd tab-separated values (TSV) files formatted like:```language\tcountry\taddress```.
|
All files can be found at https://archive.org/download/libpostal-parser-training-data-YYYYMMDD/$FILE (where YYYYMMDD is the training set date, e.g. the value stored in `current_parser_training_set`) as gzip'd tab-separated values (TSV) files formatted like: ```language\tcountry\taddress```.
|
||||||
|
|
||||||
- **formatted_addresses_tagged.random.tsv.gz** (ODBL): OSM addresses. Apartments, PO boxes, categories, etc. are added primarily to these examples
|
- **formatted_addresses_tagged.random.tsv.gz** (ODBL): OSM addresses. Apartments, PO boxes, categories, etc. are added primarily to these examples
|
||||||
- **formatted_places_tagged.random.tsv.gz** (ODBL): every toponym in OSM (even cities represented as points, etc.), reverse-geocoded to its parent admins, possibly including postal codes if they're listed on the point/polygon. Every place gets a base level of representation and places with higher populations get proportionally more.
|
- **formatted_places_tagged.random.tsv.gz** (ODBL): every toponym in OSM (even cities represented as points, etc.), reverse-geocoded to its parent admins, possibly including postal codes if they're listed on the point/polygon. Every place gets a base level of representation and places with higher populations get proportionally more.
|
||||||
|
|||||||
1
current_parser_training_set
Normal file
1
current_parser_training_set
Normal file
@@ -0,0 +1 @@
|
|||||||
|
20170304
|
||||||
@@ -3,23 +3,27 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
if [ "$#" -lt 3 ]; then
|
if [ "$#" -lt 3 ]; then
|
||||||
echo "Usage: ./libpostal_data [upload|download] [base|geodb] data_dir"
|
echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
LIBPOSTAL_VERSION_STRING="v1"
|
LIBPOSTAL_VERSION_STRING="v1"
|
||||||
|
LIBPOSTAL_RELEASE_VERSION_STRING="v1.0.0"
|
||||||
|
|
||||||
|
LIBPOSTAL_REPO_NAME="openvenues/libpostal"
|
||||||
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
|
LIBPOSTAL_S3_BUCKET_NAME="libpostal"
|
||||||
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
|
LIBPOSTAL_S3_KEY="s3://$LIBPOSTAL_S3_BUCKET_NAME"
|
||||||
LIBPOSTAL_S3_BUCKET_URL="https://$LIBPOSTAL_S3_BUCKET_NAME.s3.amazonaws.com"
|
|
||||||
LIBPOSTAL_CLOUDFRONT_URL="https://d1p366rbd94x8u.cloudfront.net"
|
GITHUB_API_URL="https://api.github.com"
|
||||||
|
LIBPOSTAL_RELEASE_API_URL="$GITHUB_API_URL/repos/$LIBPOSTAL_REPO_NAME/releases"
|
||||||
|
|
||||||
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
|
LIBPOSTAL_DATA_FILE="libpostal_data.tar.gz"
|
||||||
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
|
LIBPOSTAL_PARSER_FILE="parser.tar.gz"
|
||||||
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
LIBPOSTAL_LANG_CLASS_FILE="language_classifier.tar.gz"
|
||||||
|
|
||||||
LIBPOSTAL_DATA_S3_PREFIX="data/$LIBPOSTAL_VERSION_STRING"
|
# Location of the base data archive: <data version>/libpostal_data.tar.gz.
# The archive name matches LIBPOSTAL_DATA_FILE; it previously pointed at
# parser.tar.gz under the model version, which is the parser archive.
# NOTE(review): LIBPOSTAL_LATEST_DATA_VERSION_STRING is not defined in the
# visible part of this script -- confirm where it is set before relying on it.
LIBPOSTAL_DATA_S3_PREFIX="$LIBPOSTAL_LATEST_DATA_VERSION_STRING/libpostal_data.tar.gz"
|
||||||
LIBPOSTAL_PARSER_S3_PREFIX="models/address_parser"
|
# Location of the parser model archive: <model version>/parser.tar.gz.
# The archive name matches LIBPOSTAL_PARSER_FILE; it previously pointed at
# libpostal_data.tar.gz under the data version, which is the base data archive.
# NOTE(review): LIBPOSTAL_LATEST_MODEL_VERSION_STRING is not defined in the
# visible part of this script -- confirm where it is set before relying on it.
LIBPOSTAL_PARSER_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/parser.tar.gz"
|
||||||
LIBPOSTAL_LANG_CLASS_S3_PREFIX="models/language_classifier"
|
LIBPOSTAL_LANG_CLASS_S3_PREFIX="$LIBPOSTAL_LATEST_MODEL_VERSION_STRING/language_classifier.tar.gz"
|
||||||
|
|
||||||
COMMAND=$1
|
COMMAND=$1
|
||||||
FILE=$2
|
FILE=$2
|
||||||
@@ -58,7 +62,7 @@ kill_background_processes() {
|
|||||||
trap kill_background_processes INT
|
trap kill_background_processes INT
|
||||||
|
|
||||||
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
|
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
|
||||||
PART_CURL='curl $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5'
|
PART_CURL='curl -L $4 --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o $5'
|
||||||
DOWNLOAD_PART="$PART_MSG;$PART_CURL"
|
DOWNLOAD_PART="$PART_MSG;$PART_CURL"
|
||||||
|
|
||||||
|
|
||||||
@@ -99,10 +103,12 @@ download_multipart() {
|
|||||||
download_file() {
|
download_file() {
|
||||||
updated_path=$1
|
updated_path=$1
|
||||||
data_dir=$2
|
data_dir=$2
|
||||||
prefix=$3
|
metadata_url=$3
|
||||||
filename=$4
|
url=$4
|
||||||
name=$5
|
size=$5
|
||||||
shift 5
|
filename=$6
|
||||||
|
name=$7
|
||||||
|
shift 7
|
||||||
subdirs=$@
|
subdirs=$@
|
||||||
|
|
||||||
local_path=$data_dir/$filename
|
local_path=$data_dir/$filename
|
||||||
@@ -113,16 +119,13 @@ download_file() {
|
|||||||
|
|
||||||
echo "Checking for new libpostal $name..."
|
echo "Checking for new libpostal $name..."
|
||||||
|
|
||||||
url=$LIBPOSTAL_CLOUDFRONT_URL/$prefix/$filename
|
if [ $(curl -LsI $metadata_url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
|
||||||
|
|
||||||
if [ $(curl -sI $url -z "$(cat $updated_path)" --remote-time -w %{http_code} -o /dev/null | grep "^200$") ]; then
|
|
||||||
echo "New libpostal $name available"
|
echo "New libpostal $name available"
|
||||||
content_length=$(curl -I $url 2> /dev/null | awk 'tolower($0) ~ /^content-length:/ { print $2 }' | tr -d '[[:space:]]')
|
|
||||||
|
|
||||||
if [ $content_length -ge $LARGE_FILE_SIZE ]; then
|
if [ $size -ge $LARGE_FILE_SIZE ]; then
|
||||||
download_multipart $url $local_path $content_length
|
download_multipart $url $local_path $size
|
||||||
else
|
else
|
||||||
curl $url --retry 3 --retry-delay 2 -o $local_path
|
curl -L $url --retry 3 --retry-delay 2 -o $local_path
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then
|
if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then
|
||||||
@@ -163,45 +166,66 @@ if [ $COMMAND = "download" ]; then
|
|||||||
|
|
||||||
mkdir -p $LIBPOSTAL_DATA_DIR
|
mkdir -p $LIBPOSTAL_DATA_DIR
|
||||||
|
|
||||||
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
release_id=$(curl -s $LIBPOSTAL_RELEASE_API_URL/tags/$LIBPOSTAL_RELEASE_VERSION_STRING | grep "\"id\"" | head -n1 | grep -o '[0-9][0-9]*')
|
||||||
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $LIBPOSTAL_DATA_S3_PREFIX $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
|
release_assets="$(curl -s $LIBPOSTAL_RELEASE_API_URL/$release_id/assets)"
|
||||||
fi
|
|
||||||
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
|
||||||
latest_parser=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest")
|
|
||||||
parser_s3_prefix="$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser"
|
|
||||||
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $parser_s3_prefix $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
|
|
||||||
fi
|
|
||||||
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
|
||||||
latest_lang_class=$(curl --silent "$LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest")
|
|
||||||
lang_class_s3_prefix="$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class"
|
|
||||||
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $lang_class_s3_prefix $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
|
asset_names_tempfile="$LIBPOSTAL_DATA_DIR/asset_names.tmp"
|
||||||
echo $LIBPOSTAL_VERSION_STRING > $LIBPOSTAL_DATA_VERSION_FILE;
|
echo "$release_assets" | grep -o '"name": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_names_tempfile
|
||||||
fi
|
asset_metadata_tempfile="$LIBPOSTAL_DATA_DIR/asset_metadata.tmp"
|
||||||
|
echo "$release_assets" | grep -o '"url": *"[^"]*/releases/assets/[0-9]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_metadata_tempfile
|
||||||
|
asset_urls_tempfile="$LIBPOSTAL_DATA_DIR/asset_urls.tmp"
|
||||||
|
echo "$release_assets" | grep -o '"browser_download_url": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > $asset_urls_tempfile
|
||||||
|
asset_sizes_tempfile="$LIBPOSTAL_DATA_DIR/asset_sizes.tmp"
|
||||||
|
echo "$release_assets" | grep -o '"size": *[0-9]*' | grep -o '[0-9]*$' > $asset_sizes_tempfile
|
||||||
|
|
||||||
|
assets_tempfile="$LIBPOSTAL_DATA_DIR/assets.tmp"
|
||||||
|
paste -d' ' $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile > $assets_tempfile
|
||||||
|
|
||||||
|
rm $asset_names_tempfile $asset_metadata_tempfile $asset_urls_tempfile $asset_sizes_tempfile
|
||||||
|
|
||||||
|
while read -r line; do
|
||||||
|
asset=$(echo "$line" | cut -f1 -d' ')
|
||||||
|
asset_metadata_url=$(echo "$line" | cut -f2 -d' ')
|
||||||
|
asset_url=$(echo "$line" | cut -f3 -d' ')
|
||||||
|
asset_size=$(echo "$line" | cut -f4 -d' ')
|
||||||
|
|
||||||
|
if [ $asset = $LIBPOSTAL_DATA_FILE ] && ([ $FILE = "base" ] || [ $FILE = "all" ]); then
|
||||||
|
download_file $LIBPOSTAL_DATA_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_DATA_FILE "data file" $BASIC_MODULE_DIRS
|
||||||
|
fi
|
||||||
|
if [ $asset = $LIBPOSTAL_PARSER_FILE ] && ([ $FILE = "parser" ] || [ $FILE = "all" ]); then
|
||||||
|
download_file $LIBPOSTAL_PARSER_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_PARSER_FILE "parser data file" $PARSER_MODULE_DIR
|
||||||
|
fi
|
||||||
|
if [ $asset = $LIBPOSTAL_LANG_CLASS_FILE ] && ([ $FILE = "language_classifier" ] || [ $FILE = "all" ]); then
|
||||||
|
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $asset_metadata_url $asset_url $asset_size $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
|
||||||
|
echo $LIBPOSTAL_VERSION_STRING > $LIBPOSTAL_DATA_VERSION_FILE;
|
||||||
|
fi
|
||||||
|
done < $assets_tempfile;
|
||||||
|
rm $assets_tempfile
|
||||||
|
|
||||||
elif [ $COMMAND = "upload" ]; then
|
elif [ $COMMAND = "upload" ]; then
|
||||||
|
echo "upload not implemented yet"
|
||||||
|
|
||||||
if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
#if [ $FILE = "base" ] || [ $FILE = "all" ]; then
|
||||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
|
# tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
|
||||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/
|
# aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/
|
||||||
fi
|
#fi
|
||||||
|
|
||||||
if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
#if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
|
||||||
latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
|
# latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
|
||||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
|
# tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
|
||||||
parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
|
# parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
|
||||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir
|
# aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir
|
||||||
fi
|
#fi
|
||||||
|
|
||||||
if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
|
||||||
latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
|
|
||||||
tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
|
|
||||||
lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
|
|
||||||
aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
#if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
|
||||||
|
# latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
|
||||||
|
# tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
|
||||||
|
# lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
|
||||||
|
# aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir
|
||||||
|
#fi
|
||||||
else
|
else
|
||||||
echo "Invalid command: $COMMAND"
|
echo "Invalid command: $COMMAND"
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
Reference in New Issue
Block a user