Files
libpostal/scripts/geodata/geoplanet/create_geoplanet_db.sh

146 lines
4.1 KiB
Bash
Executable File

#!/usr/bin/env bash
: '
create_geoplanet_db.sh
-------------------------
Shell script to download Geo Planet and derive inputs
for address parser training set construction.
Usage: ./create_geoplanet_db.sh out_dir
'
if [ "$#" -ge 1 ]; then
OUT_DIR=$1
mkdir -p $OUT_DIR
else
OUT_DIR=$(pwd)
fi
GEOPLANET_ZIP_FILE="geoplanet_data_7.10.0.zip"
# Internet Archive URL
GEOPLANET_URL="https://archive.org/download/$GEOPLANET_ZIP_FILE/$GEOPLANET_ZIP_FILE"
GEOPLANET_ORIGINAL_PLACES_FILE="geoplanet_places_7.10.0.tsv"
GEOPLANET_ADMINS_FILE="geoplanet_admins_7.10.0.tsv"
GEOPLANET_ORIGINAL_ALIASES_FILE="geoplanet_aliases_7.10.0.tsv"
GEOPLANET_PLACES_FILE="geoplanet_places.tsv"
GEOPLANET_POSTAL_CODES_FILE="geoplanet_postal_codes.tsv"
GEOPLANET_ALIASES_FILE="geoplanet_aliases.tsv"
GEOPLANET_GEONAMES_CONCORDANCE_FILE="geonames-geoplanet-matches.csv"
GEOPLANET_GEONAMES_CONCORDANCE_URL="https://github.com/blackmad/geoplanet-concordance/raw/master/current/$GEOPLANET_GEONAMES_CONCORDANCE_FILE"
GEOPLANET_DB_FILE="geoplanet.db"
function download_file() {
echo "Downloading $1"
response=$(curl -sL -w "%{http_code}" $1 --retry 3 --retry-delay 5 -o $OUT_DIR/$2)
if [ $response -ne "200" ]; then
echo "Could not download $GEOPLANET_URL"
exit 1
fi
}
if [ ! -f $OUT_DIR/$GEOPLANET_ZIP_FILE ]; then
echo "Downloading GeoPlanet"
download_file $GEOPLANET_URL $GEOPLANET_ZIP_FILE
fi
cd $OUT_DIR
echo "Unzipping GeoPlanet file"
unzip -o $GEOPLANET_ZIP_FILE
echo "Creating GeoPlanet postal codes file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Zip") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_POSTAL_CODES_FILE
echo "Creating GeoPlanet places file"
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Continent" || $5 == "Country" || $5 == "Nationality" || $5 == "State" || $5 == "County" || $5 == "Town" || $5 == "LocalAdmin" || $5 == "Island" || $5 == "Suburb") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_PLACES_FILE
echo "Creating GeoPlanet aliases file"
tail +2 $GEOPLANET_ORIGINAL_ALIASES_FILE > $GEOPLANET_ALIASES_FILE
echo "Fetching GeoNames concordance"
download_file $GEOPLANET_GEONAMES_CONCORDANCE_URL $GEOPLANET_GEONAMES_CONCORDANCE_FILE
echo "Creating SQLite db"
echo "
DROP TABLE IF EXISTS places;
CREATE TABLE places (
id integer primary key,
country_code text,
name text,
language text,
place_type text,
parent_id integer
);
.separator \t
.import $OUT_DIR/$GEOPLANET_PLACES_FILE places
CREATE INDEX places_parent_id_index on places(parent_id);
CREATE INDEX places_country_code on places(country_code);
DROP TABLE IF EXISTS postal_codes;
CREATE TABLE postal_codes (
id integer primary key,
country_code text,
name text,
language text,
place_type text,
parent_id integer
);
.import $GEOPLANET_POSTAL_CODES_FILE postal_codes
CREATE INDEX postal_codes_parent_id_index on postal_codes(parent_id);
CREATE INDEX postal_codes_country_code on postal_codes(country_code);
DROP TABLE IF EXISTS admins;
CREATE TABLE admins (
id integer primary key,
country_code text,
state_id integer,
county_id integer,
local_admin_id integer,
country_id integer,
continent_id integer
);
.import $GEOPLANET_ADMINS_FILE admins
CREATE INDEX admin_country_code on admins(country_code);
CREATE INDEX admin_state_id on admins(state_id);
CREATE INDEX admin_county_id on admins(county_id);
CREATE INDEX admin_local_admin_id on admins(local_admin_id);
CREATE INDEX admin_country_id on admins(country_id);
CREATE INDEX admin_continent_id on admins(continent_id);
DROP TABLE IF EXISTS aliases;
CREATE TABLE aliases (
id integer,
name text,
name_type text,
language text
);
.import $GEOPLANET_ALIASES_FILE aliases
CREATE INDEX alias_id on aliases(id);
DROP TABLE IF EXISTS geonames_concordance;
CREATE TABLE geonames_concordance (
id integer primary key,
geonames_id integer,
name text,
lat number,
lon number
);
.mode csv
.import $GEOPLANET_GEONAMES_CONCORDANCE_FILE geonames_concordance
CREATE INDEX geonames_concordance_geonames_id on geonames_concordance(geonames_id);
" | sqlite3 $OUT_DIR/$GEOPLANET_DB_FILE