146 lines
4.1 KiB
Bash
Executable File
146 lines
4.1 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
: '
|
|
create_geoplanet_db.sh
|
|
-------------------------
|
|
|
|
Shell script to download Geo Planet and derive inputs
|
|
for address parser training set construction.
|
|
|
|
Usage: ./create_geoplanet_db.sh out_dir
|
|
'
|
|
|
|
if [ "$#" -ge 1 ]; then
|
|
OUT_DIR=$1
|
|
mkdir -p $OUT_DIR
|
|
else
|
|
OUT_DIR=$(pwd)
|
|
fi
|
|
|
|
GEOPLANET_ZIP_FILE="geoplanet_data_7.10.0.zip"
|
|
# Internet Archive URL
|
|
GEOPLANET_URL="https://archive.org/download/$GEOPLANET_ZIP_FILE/$GEOPLANET_ZIP_FILE"
|
|
GEOPLANET_ORIGINAL_PLACES_FILE="geoplanet_places_7.10.0.tsv"
|
|
GEOPLANET_ADMINS_FILE="geoplanet_admins_7.10.0.tsv"
|
|
GEOPLANET_ORIGINAL_ALIASES_FILE="geoplanet_aliases_7.10.0.tsv"
|
|
|
|
GEOPLANET_PLACES_FILE="geoplanet_places.tsv"
|
|
GEOPLANET_POSTAL_CODES_FILE="geoplanet_postal_codes.tsv"
|
|
GEOPLANET_ALIASES_FILE="geoplanet_aliases.tsv"
|
|
|
|
GEOPLANET_GEONAMES_CONCORDANCE_FILE="geonames-geoplanet-matches.csv"
|
|
GEOPLANET_GEONAMES_CONCORDANCE_URL="https://github.com/blackmad/geoplanet-concordance/raw/master/current/$GEOPLANET_GEONAMES_CONCORDANCE_FILE"
|
|
|
|
GEOPLANET_DB_FILE="geoplanet.db"
|
|
|
|
function download_file() {
|
|
echo "Downloading $1"
|
|
response=$(curl -sL -w "%{http_code}" $1 --retry 3 --retry-delay 5 -o $OUT_DIR/$2)
|
|
if [ $response -ne "200" ]; then
|
|
echo "Could not download $GEOPLANET_URL"
|
|
exit 1
|
|
fi
|
|
}
|
|
|
|
|
|
if [ ! -f $OUT_DIR/$GEOPLANET_ZIP_FILE ]; then
|
|
echo "Downloading GeoPlanet"
|
|
download_file $GEOPLANET_URL $GEOPLANET_ZIP_FILE
|
|
fi
|
|
|
|
cd $OUT_DIR
|
|
echo "Unzipping GeoPlanet file"
|
|
unzip -o $GEOPLANET_ZIP_FILE
|
|
|
|
echo "Creating GeoPlanet postal codes file"
|
|
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Zip") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_POSTAL_CODES_FILE
|
|
|
|
echo "Creating GeoPlanet places file"
|
|
awk -F'\t' 'BEGIN{OFS="\t";} {if ($5 == "Continent" || $5 == "Country" || $5 == "Nationality" || $5 == "State" || $5 == "County" || $5 == "Town" || $5 == "LocalAdmin" || $5 == "Island" || $5 == "Suburb") print $0;}' $GEOPLANET_ORIGINAL_PLACES_FILE > $GEOPLANET_PLACES_FILE
|
|
|
|
echo "Creating GeoPlanet aliases file"
|
|
tail +2 $GEOPLANET_ORIGINAL_ALIASES_FILE > $GEOPLANET_ALIASES_FILE
|
|
|
|
echo "Fetching GeoNames concordance"
|
|
download_file $GEOPLANET_GEONAMES_CONCORDANCE_URL $GEOPLANET_GEONAMES_CONCORDANCE_FILE
|
|
|
|
echo "Creating SQLite db"
|
|
|
|
echo "
|
|
DROP TABLE IF EXISTS places;
|
|
CREATE TABLE places (
|
|
id integer primary key,
|
|
country_code text,
|
|
name text,
|
|
language text,
|
|
place_type text,
|
|
parent_id integer
|
|
);
|
|
|
|
.separator \t
|
|
.import $OUT_DIR/$GEOPLANET_PLACES_FILE places
|
|
|
|
CREATE INDEX places_parent_id_index on places(parent_id);
|
|
CREATE INDEX places_country_code on places(country_code);
|
|
|
|
DROP TABLE IF EXISTS postal_codes;
|
|
CREATE TABLE postal_codes (
|
|
id integer primary key,
|
|
country_code text,
|
|
name text,
|
|
language text,
|
|
place_type text,
|
|
parent_id integer
|
|
);
|
|
|
|
.import $GEOPLANET_POSTAL_CODES_FILE postal_codes
|
|
CREATE INDEX postal_codes_parent_id_index on postal_codes(parent_id);
|
|
CREATE INDEX postal_codes_country_code on postal_codes(country_code);
|
|
|
|
DROP TABLE IF EXISTS admins;
|
|
CREATE TABLE admins (
|
|
id integer primary key,
|
|
country_code text,
|
|
state_id integer,
|
|
county_id integer,
|
|
local_admin_id integer,
|
|
country_id integer,
|
|
continent_id integer
|
|
);
|
|
|
|
.import $GEOPLANET_ADMINS_FILE admins
|
|
|
|
CREATE INDEX admin_country_code on admins(country_code);
|
|
CREATE INDEX admin_state_id on admins(state_id);
|
|
CREATE INDEX admin_county_id on admins(county_id);
|
|
CREATE INDEX admin_local_admin_id on admins(local_admin_id);
|
|
CREATE INDEX admin_country_id on admins(country_id);
|
|
CREATE INDEX admin_continent_id on admins(continent_id);
|
|
|
|
DROP TABLE IF EXISTS aliases;
|
|
CREATE TABLE aliases (
|
|
id integer,
|
|
name text,
|
|
name_type text,
|
|
language text
|
|
);
|
|
|
|
.import $GEOPLANET_ALIASES_FILE aliases
|
|
|
|
CREATE INDEX alias_id on aliases(id);
|
|
|
|
DROP TABLE IF EXISTS geonames_concordance;
|
|
CREATE TABLE geonames_concordance (
|
|
id integer primary key,
|
|
geonames_id integer,
|
|
name text,
|
|
lat number,
|
|
lon number
|
|
);
|
|
|
|
.mode csv
|
|
.import $GEOPLANET_GEONAMES_CONCORDANCE_FILE geonames_concordance
|
|
|
|
CREATE INDEX geonames_concordance_geonames_id on geonames_concordance(geonames_id);
|
|
|
|
" | sqlite3 $OUT_DIR/$GEOPLANET_DB_FILE |