#!/usr/bin/env bash

: '
create_geoplanet_db.sh
----------------------

Shell script to download Geo Planet and derive inputs
for address parser training set construction.

Downloads the GeoPlanet archive (unless it is already present in out_dir),
splits the places file into postal codes and admin/place rows, fetches the
GeoNames concordance and loads everything into a SQLite database.

Usage: ./create_geoplanet_db.sh [out_dir]

out_dir defaults to the current directory and is created if missing.
'

# Resolve the output directory to an absolute path. The script changes into
# it further down, after which a relative OUT_DIR would point somewhere else.
if [ "$#" -ge 1 ]; then
    mkdir -p -- "$1" || exit 1
    OUT_DIR=$(CDPATH= cd -- "$1" && pwd) || exit 1
else
    OUT_DIR=$(pwd)
fi

GEOPLANET_ZIP_FILE="geoplanet_data_7.10.0.zip"
# Internet Archive URL
GEOPLANET_URL="https://archive.org/download/$GEOPLANET_ZIP_FILE/$GEOPLANET_ZIP_FILE"
GEOPLANET_ORIGINAL_PLACES_FILE="geoplanet_places_7.10.0.tsv"
GEOPLANET_ADMINS_FILE="geoplanet_admins_7.10.0.tsv"
GEOPLANET_ALIASES_FILE="geoplanet_aliases_7.10.0.tsv"

GEOPLANET_PLACES_FILE="geoplanet_places.tsv"
GEOPLANET_POSTAL_CODES_FILE="geoplanet_postal_codes.tsv"

GEOPLANET_GEONAMES_CONCORDANCE_FILE="geonames-geoplanet-matches.csv"
GEOPLANET_GEONAMES_CONCORDANCE_URL="https://github.com/blackmad/geoplanet-concordance/raw/master/current/$GEOPLANET_GEONAMES_CONCORDANCE_FILE"

GEOPLANET_DB_FILE="geoplanet.db"

#######################################
# Download a URL into OUT_DIR.
# Globals:   OUT_DIR (read)
# Arguments: $1 - URL to fetch
#            $2 - file name, relative to OUT_DIR, to write the body to
# Outputs:   progress message on stdout, failure message on stderr
# Exits the whole script with status 1 unless the final HTTP status is 200.
#######################################
function download_file() {
    local url=$1
    local dest="$OUT_DIR/$2"
    local response

    echo "Downloading $url"
    response=$(curl -sL -w "%{http_code}" --retry 3 --retry-delay 5 -o "$dest" "$url")
    if [ "$response" != "200" ]; then
        echo "Could not download $url" >&2
        # curl -o stores error bodies too; remove the file so that a later
        # run does not mistake it for a completed download.
        rm -f -- "$dest"
        exit 1
    fi
}


# The archive is large, so it is only fetched when not already present.
if [ ! -f "$OUT_DIR/$GEOPLANET_ZIP_FILE" ]; then
    echo "Downloading GeoPlanet"
    download_file "$GEOPLANET_URL" "$GEOPLANET_ZIP_FILE"
fi

# Everything below works with file names relative to OUT_DIR.
cd "$OUT_DIR" || exit 1

echo "Unzipping GeoPlanet file"
unzip -o "$GEOPLANET_ZIP_FILE"
unzip_status=$?
# unzip exits with 1 when it only had warnings but still extracted the files.
if [ "$unzip_status" -gt 1 ]; then
    echo "Could not unzip $GEOPLANET_ZIP_FILE" >&2
    exit 1
fi

# Column 5 of the places file is the place type.
echo "Creating GeoPlanet postal codes file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Zip") print $0;}' "$GEOPLANET_ORIGINAL_PLACES_FILE" > "$GEOPLANET_POSTAL_CODES_FILE"

echo "Creating GeoPlanet places file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Continent" || $5 == "Country" || $5 == "Nationality" || $5 == "State" || $5 == "County" || $5 == "Town" || $5 == "LocalAdmin" || $5 == "Suburb") print $0;}' "$GEOPLANET_ORIGINAL_PLACES_FILE" > "$GEOPLANET_PLACES_FILE"

echo "Fetching GeoNames concordance"
download_file "$GEOPLANET_GEONAMES_CONCORDANCE_URL" "$GEOPLANET_GEONAMES_CONCORDANCE_FILE"

echo "Creating SQLite db"

# The here-document expands the shell variables but leaves "\t" untouched, so
# sqlite3 itself turns it into a tab. The .import paths are relative to the
# current directory, which is OUT_DIR at this point.
sqlite3 "$OUT_DIR/$GEOPLANET_DB_FILE" <<EOF

DROP TABLE IF EXISTS places;
CREATE TABLE places (
    id integer primary key,
    country_code text,
    name text,
    language text,
    place_type text,
    parent_id integer
);

.separator \t
.import $GEOPLANET_PLACES_FILE places

CREATE INDEX places_parent_id_index on places(parent_id);
CREATE INDEX places_country_code on places(country_code);

DROP TABLE IF EXISTS postal_codes;
CREATE TABLE postal_codes (
    id integer primary key,
    country_code text,
    name text,
    language text,
    place_type text,
    parent_id integer
);

.import $GEOPLANET_POSTAL_CODES_FILE postal_codes
CREATE INDEX postal_codes_parent_id_index on postal_codes(parent_id);
CREATE INDEX postal_codes_country_code on postal_codes(country_code);

DROP TABLE IF EXISTS admins;
CREATE TABLE admins (
    id integer primary key,
    country_code text,
    state_id integer,
    county_id integer,
    local_admin_id integer,
    country_id integer,
    continent_id integer
);

.import $GEOPLANET_ADMINS_FILE admins

CREATE INDEX admin_country_code on admins(country_code);
CREATE INDEX admin_state_id on admins(state_id);
CREATE INDEX admin_county_id on admins(county_id);
CREATE INDEX admin_local_admin_id on admins(local_admin_id);
CREATE INDEX admin_country_id on admins(country_id);
CREATE INDEX admin_continent_id on admins(continent_id);

DROP TABLE IF EXISTS aliases;
CREATE TABLE aliases (
    id integer,
    name text,
    name_type text,
    language text
);

.import $GEOPLANET_ALIASES_FILE aliases

CREATE INDEX alias_id on aliases(id);

DROP TABLE IF EXISTS geonames_concordance;
CREATE TABLE geonames_concordance (
    id integer primary key,
    geonames_id integer,
    name text,
    lat number,
    lon number
);

.mode csv
.import $GEOPLANET_GEONAMES_CONCORDANCE_FILE geonames_concordance

CREATE INDEX geonames_concordance_geonames_id on geonames_concordance(geonames_id);

EOF