From ef94f1b7124c665f61202bba355e2b7ee1568039 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 19 Oct 2015 15:39:31 -0400 Subject: [PATCH] [doc] Adding some comments to fetch_osm_address_data.sh --- scripts/geodata/osm/fetch_osm_address_data.sh | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/geodata/osm/fetch_osm_address_data.sh b/scripts/geodata/osm/fetch_osm_address_data.sh index 9a9aaa14..636c7c95 100755 --- a/scripts/geodata/osm/fetch_osm_address_data.sh +++ b/scripts/geodata/osm/fetch_osm_address_data.sh @@ -1,11 +1,21 @@ #!/usr/bin/env bash +: ' +fetch_osm_address_data.sh +------------------------- + +Shell script to download OSM planet and derive inputs +for language detection and address parser training set +construction. +' + if [ "$#" -eq 1 ]; then OUT_DIR=$1 else OUT_DIR=`pwd` fi +# Check for osmfilter and osmconvert if ! type -P osmfilter osmconvert > /dev/null; then cat << EOF ERROR: osmfilter and osmconvert are required @@ -24,6 +34,8 @@ PREV_DIR=`pwd` cd $OUT_DIR +# Download planet as PBF +# TODO: currently uses single mirror, randomly choose one instead echo "Started OSM download: `date`" wget http://ftp5.gwdg.de/pub/misc/openstreetmap/planet.openstreetmap.org/pbf/planet-latest.osm.pbf @@ -31,8 +43,11 @@ echo "Converting to o5m: `date`" PLANET_PBF="planet-latest.osm.pbf" PLANET_O5M="planet-latest.o5m" +# Needs to be in O5M for some of the subsequent steps to work whereas PBF is smaller for download osmconvert $PLANET_PBF -o=$PLANET_O5M rm $PLANET_PBF + +# Address data set for use in parser, language detection echo "Filtering for records with address tags: `date`" PLANET_ADDRESSES_O5M="planet-addresses.o5m" osmfilter $PLANET_O5M --keep="addr:street= and ( ( name= and amenity= ) or addr:housename= or addr:housenumber= )" --drop-author --drop-version -o=$PLANET_ADDRESSES_O5M @@ -43,6 +58,7 @@ PLANET_ADDRESSES="planet-addresses.osm" osmfilter $PLANET_ADDRESSES_LATLONS --keep="addr:street= and ( ( name= and 
amenity= ) or addr:housename= or addr:housenumber= )" -o=$PLANET_ADDRESSES rm $PLANET_ADDRESSES_LATLONS +# Border data set for use in R-tree index/reverse geocoding, parsing, language detection echo " Filtering for borders: `date`" PLANET_BORDERS_O5M="planet-borders.o5m" PLANET_BORDERS="planet-borders.osm" @@ -55,7 +71,7 @@ rm $PLANET_BORDERS_O5M osmfilter $PLANET_BORDERS_LATLONS --keep="boundary=administrative or place=city or place=town or place=neighbourhood or place=suburb" -o=$PLANET_BORDERS rm $PLANET_BORDERS_LATLONS - +# Venue data set for use in venue classification echo "Filtering for venue records: `date`" PLANET_VENUES_O5M="planet-venues.o5m" osmfilter $PLANET_O5M --keep="name= and ( amenity= or building= )" --drop-author --drop-version -o=$PLANET_VENUES_O5M @@ -66,12 +82,11 @@ PLANET_VENUES="planet-venues.osm" osmfilter $PLANET_VENUES_LATLONS --keep="name= and ( amenity= or building= )" -o=$PLANET_VENUES rm $PLANET_VENUES_LATLONS +# Streets data set for use in language classification echo "Filtering ways: `date`" PLANET_WAYS_O5M="planet-ways.o5m" osmfilter planet-latest.o5m --keep="name= and highway=" --drop-relations --drop-author --drop-version -o=$PLANET_WAYS_O5M rm $PLANET_O5M - -echo "Extracting ways: `date`" PLANET_WAYS_NODES_LATLON="planet-ways-nodes-latlons.o5m" osmconvert $PLANET_WAYS_O5M --max-objects=1000000000 --all-to-nodes -o=$PLANET_WAYS_NODES_LATLON # 10^15 is the offset used for ways and relations with --all-to-nodes, extracts just the ways