[geoplanet] Script to download GeoPlanet directly and load into a SQLite db for extracting the postal codes (admin areas need cleanupi in a number of countries, that's next)
This commit is contained in:
142
scripts/geodata/geoplanet/create_geoplanet_db.sh
Executable file
142
scripts/geodata/geoplanet/create_geoplanet_db.sh
Executable file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
: '
|
||||
fetch_geoplanet.sh
|
||||
-------------------------
|
||||
|
||||
Shell script to download Geo Planet and derive inputs
|
||||
for address parser training set construction.
|
||||
|
||||
Usage: ./fetch_geoplanet.sh out_dir
|
||||
'
|
||||
|
||||
if [ "$#" -ge 1 ]; then
|
||||
OUT_DIR=$1
|
||||
mkdir -p $OUT_DIR
|
||||
else
|
||||
OUT_DIR=$(pwd)
|
||||
fi
|
||||
|
||||
GEOPLANET_ZIP_FILE="geoplanet_data_7.10.0.zip"
|
||||
# Internet Archive URL
|
||||
GEOPLANET_URL="https://archive.org/download/$GEOPLANET_ZIP_FILE/$GEOPLANET_ZIP_FILE"
|
||||
GEOPLANET_ORIGINAL_PLACES_FILE="geoplanet_places_7.10.0.tsv"
|
||||
GEOPLANET_ADMINS_FILE="geoplanet_admins_7.10.0.tsv"
|
||||
GEOPLANET_ALIASES_FILE="geoplanet_aliases_7.10.0.tsv"
|
||||
|
||||
GEOPLANET_PLACES_FILE="geoplanet_places.tsv"
|
||||
GEOPLANET_POSTAL_CODES_FILE="geoplanet_postal_codes.tsv"
|
||||
|
||||
GEOPLANET_GEONAMES_CONCORDANCE_FILE="geonames-geoplanet-matches.csv"
|
||||
GEOPLANET_GEONAMES_CONCORDANCE_URL="https://github.com/blackmad/geoplanet-concordance/raw/master/current/$GEOPLANET_GEONAMES_CONCORDANCE_FILE"
|
||||
|
||||
GEOPLANET_DB_FILE="geoplanet.db"
|
||||
|
||||
function download_file() {
|
||||
echo "Downloading $1"
|
||||
response=$(curl -sL -w "%{http_code}" $1 --retry 3 --retry-delay 5 -o $OUT_DIR/$2)
|
||||
if [ $response -ne "200" ]; then
|
||||
echo "Could not download $GEOPLANET_URL"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
if [ ! -f $OUT_DIR/$GEOPLANET_ZIP_FILE ]; then
|
||||
echo "Downloading GeoPlanet"
|
||||
download_file $GEOPLANET_URL GEOPLANET_ZIP_FILE
|
||||
fi
|
||||
|
||||
cd $OUT_DIR
|
||||
echo "Unzipping GeoPlanet file"
|
||||
unzip -o $GEOPLANET_ZIP_FILE
|
||||
|
||||
echo "Creating GeoPlanet postal codes file"
|
||||
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Zip") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_POSTAL_CODES_FILE
|
||||
|
||||
echo "Creating GeoPlanet places file"
|
||||
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Continent" || $5 == "Country" || $5 == "Nationality" || $5 == "State" || $5 == "County" || $5 == "Town" || $5 == "LocalAdmin" || $5 == "Suburb") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_PLACES_FILE
|
||||
|
||||
echo "Fetching GeoNames concordance"
|
||||
download_file $GEOPLANET_GEONAMES_CONCORDANCE_URL $GEOPLANET_GEONAMES_CONCORDANCE_FILE
|
||||
|
||||
echo "Creating SQLite db"
|
||||
|
||||
echo "
|
||||
DROP TABLE IF EXISTS places;
|
||||
CREATE TABLE places (
|
||||
id integer primary key,
|
||||
country_code text,
|
||||
name text,
|
||||
language text,
|
||||
place_type text,
|
||||
parent_id integer
|
||||
);
|
||||
|
||||
.separator \t
|
||||
.import $OUT_DIR/$GEOPLANET_PLACES_FILE places
|
||||
|
||||
CREATE INDEX places_parent_id_index on places(parent_id);
|
||||
CREATE INDEX places_country_code on places(country_code);
|
||||
|
||||
DROP TABLE IF EXISTS postal_codes;
|
||||
CREATE TABLE postal_codes (
|
||||
id integer primary key,
|
||||
country_code text,
|
||||
name text,
|
||||
language text,
|
||||
place_type text,
|
||||
parent_id integer
|
||||
);
|
||||
|
||||
.import $GEOPLANET_POSTAL_CODES_FILE postal_codes
|
||||
CREATE INDEX postal_codes_parent_id_index on postal_codes(parent_id);
|
||||
CREATE INDEX postal_codes_country_code on postal_codes(country_code);
|
||||
|
||||
DROP TABLE IF EXISTS admins;
|
||||
CREATE TABLE admins (
|
||||
id integer primary key,
|
||||
country_code text,
|
||||
state_id integer,
|
||||
county_id integer,
|
||||
local_admin_id integer,
|
||||
country_id integer,
|
||||
continent_id integer
|
||||
);
|
||||
|
||||
.import $GEOPLANET_ADMINS_FILE admins
|
||||
|
||||
CREATE INDEX admin_country_code on admins(country_code);
|
||||
CREATE INDEX admin_state_id on admins(state_id);
|
||||
CREATE INDEX admin_county_id on admins(county_id);
|
||||
CREATE INDEX admin_local_admin_id on admins(local_admin_id);
|
||||
CREATE INDEX admin_country_id on admins(country_id);
|
||||
CREATE INDEX admin_continent_id on admins(continent_id);
|
||||
|
||||
DROP TABLE IF EXISTS aliases;
|
||||
CREATE TABLE aliases (
|
||||
id integer,
|
||||
name text,
|
||||
name_type text,
|
||||
language text
|
||||
);
|
||||
|
||||
.import $GEOPLANET_ALIASES_FILE aliases
|
||||
|
||||
CREATE INDEX alias_id on aliases(id);
|
||||
|
||||
DROP TABLE IF EXISTS geonames_concordance;
|
||||
CREATE TABLE geonames_concordance (
|
||||
id integer primary key,
|
||||
geonames_id integer,
|
||||
name text,
|
||||
lat number,
|
||||
lon number
|
||||
);
|
||||
|
||||
.mode csv
|
||||
.import $GEOPLANET_GEONAMES_CONCORDANCE_FILE geonames_concordance
|
||||
|
||||
CREATE INDEX geonames_concordance_geonames_id on geonames_concordance(geonames_id);
|
||||
|
||||
" | sqlite3 $OUT_DIR/$GEOPLANET_DB_FILE
|
||||
Reference in New Issue
Block a user