Initial fork commit
scripts/geodata/whosonfirst/__init__.py (new, empty file)
scripts/geodata/whosonfirst/client.py (new file, 55 lines)
@@ -0,0 +1,55 @@
import boto3
import os
import six
import ujson as json

from geodata.encoding import safe_encode
from geodata.file_utils import ensure_dir


class WhosOnFirst(object):
    WOF_S3_BUCKET = 'whosonfirst.mapzen.com'

    def __init__(self, wof_dir, **s3_args):
        # Pass any extra S3 configuration (region, credentials, etc.) through to boto3
        self.s3 = boto3.client('s3', **s3_args)
        self.wof_dir = wof_dir

    @classmethod
    def path_and_filename(cls, wof_id):
        # WOF ids are sharded into three-character path segments,
        # e.g. 85688637 -> ('856/886/37', '85688637.geojson')
        id_str = safe_encode(wof_id)
        n = 3
        parts = [id_str[i:i + n] for i in six.moves.xrange(0, len(id_str), n)]
        filename = six.u('{}.geojson').format(wof_id)
        return six.u('/').join(parts), filename

    def local_path(self, wof_id):
        s3_path, filename = self.path_and_filename(wof_id)
        local_path = s3_path
        if os.sep != six.u('/'):
            local_path = s3_path.replace(six.u('/'), os.sep)
        return os.path.join(self.wof_dir, local_path, filename)

    def exists_locally(self, wof_id):
        local_path = self.local_path(wof_id)
        return os.path.exists(local_path)

    def load(self, wof_id):
        local_path = self.local_path(wof_id)
        with open(local_path) as f:
            return json.load(f)

    def download_file(self, wof_id):
        s3_path, filename = self.path_and_filename(wof_id)

        local_path = self.local_path(wof_id)
        local_dir = os.path.dirname(local_path)

        s3_key = six.u('/').join(('data', s3_path, filename))
        try:
            bucket = self.WOF_S3_BUCKET
            # Confirm the object exists before creating local directories
            self.s3.head_object(Bucket=bucket, Key=s3_key)
            ensure_dir(local_dir)
            if not os.path.exists(local_path):
                self.s3.download_file(self.WOF_S3_BUCKET, s3_key, local_path)
            return True
        except Exception:
            return False
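As a rough usage sketch (the id and cache directory below are illustrative, and boto3 is assumed to resolve credentials through its default chain), the client shards a numeric WOF id into three-character path segments and mirrors that layout on disk:

from geodata.whosonfirst.client import WhosOnFirst

client = WhosOnFirst('/data/wof/admin')

# e.g. 85688637 -> ('856/886/37', '85688637.geojson')
print(WhosOnFirst.path_and_filename(85688637))

if not client.exists_locally(85688637):
    # fetches data/856/886/37/85688637.geojson from the bucket into the local cache
    client.download_file(85688637)

record = client.load(85688637)
print(record['properties'].get('wof:name'))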
scripts/geodata/whosonfirst/crawl.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import gevent
import gevent.pool

import os
import six
import ujson as json

from geodata.whosonfirst.client import WhosOnFirst
from geodata.encoding import safe_encode
from geodata.file_utils import ensure_dir


class WhosOnFirstCrawler(object):
    def __init__(self, wof_dir, cache_size=10000, **s3_args):
        # cache_size is accepted for future use but not consulted yet
        self.wof_dir = wof_dir
        self.admin_dir = os.path.join(wof_dir, 'admin')
        ensure_dir(self.admin_dir)
        self.client = WhosOnFirst(self.admin_dir, **s3_args)

    def walk_files(self, base_dir):
        # Yield every GeoJSON file under <repo>/data
        for root, dirs, files in os.walk(os.path.join(base_dir, 'data')):
            if not files:
                continue
            for filename in files:
                yield os.path.join(root, filename)

    def download_dependencies(self, path):
        with open(path) as f:
            data = json.load(f)
        props = data['properties']

        _, filename = os.path.split(path)
        current_wof_id = filename.rsplit('.geojson', 1)[0]

        # Fetch every ancestor in the record's hierarchy that isn't already cached locally
        for hierarchy in props.get('wof:hierarchy', []):
            for key, wof_id in six.iteritems(hierarchy):
                wof_id = safe_encode(wof_id)

                if wof_id != current_wof_id and wof_id != '-1' and not self.client.exists_locally(wof_id):
                    if not self.client.download_file(wof_id):
                        print('error downloading {}'.format(wof_id))
                        continue
        return props.get('name')

    def data_and_dependencies(self, path):
        with open(path) as f:
            data = json.load(f)
        props = data['properties']

        _, filename = os.path.split(path)
        current_wof_id = filename.rsplit('.geojson', 1)[0]

        dependencies = {}

        for hierarchy in props.get('wof:hierarchy', []):
            for key, wof_id in six.iteritems(hierarchy):
                wof_id = safe_encode(wof_id)
                if wof_id in dependencies or wof_id == current_wof_id:
                    continue

                if not self.client.exists_locally(wof_id):
                    continue

                value = self.client.load(wof_id)

                # Only include properties, not all the polygon data
                dependencies[wof_id] = value.get('properties', {})

        return data, dependencies

    def load(self, repo_dir):
        return (self.data_and_dependencies(filename) for filename in self.walk_files(repo_dir))

    def crawl(self, repo_dir, workers=10):
        pool = gevent.pool.Pool(workers)
        return pool.imap_unordered(self.download_dependencies, self.walk_files(repo_dir))
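A sketch of driving the crawler over one previously cloned admin repo (paths are illustrative; note that boto3 is not gevent-aware by default, so gevent monkey patching or a gevent-friendly HTTP layer may be needed for real download concurrency):

from geodata.whosonfirst.crawl import WhosOnFirstCrawler

crawler = WhosOnFirstCrawler('/data/wof')

# Download any missing hierarchy ancestors for every record in the repo
for name in crawler.crawl('/data/wof/whosonfirst-data-admin-us', workers=10):
    pass

# Iterate each record together with the properties of its locally cached ancestors
for data, dependencies in crawler.load('/data/wof/whosonfirst-data-admin-us'):
    print(data['properties'].get('wof:name'), len(dependencies))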
scripts/geodata/whosonfirst/download_wof_admin_polygon.py (new file, 27 lines)
@@ -0,0 +1,27 @@
import os
import pycountry
import subprocess
import sys


this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))


WOF_DATA_ADMIN_REPO_URL_PREFIX = "https://github.com/whosonfirst-data/whosonfirst-data/"
WOF_DATA_ADMIN_REPO_PREFIX = "whosonfirst-data-admin-"


def download_wof_data_admin(wof_dir):
    for country_object in pycountry.countries:
        # Note: newer pycountry releases expose this attribute as alpha_2
        repo_name = WOF_DATA_ADMIN_REPO_PREFIX + country_object.alpha2.lower()
        repo_location = os.path.join(wof_dir, repo_name)
        if not os.path.exists(repo_location):
            # Clone into the expected location so the existence check above holds on re-runs
            subprocess.call(["git", "clone", WOF_DATA_ADMIN_REPO_URL_PREFIX + repo_name, repo_location])


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: python download_wof_admin_polygon.py wof_dir')

    download_wof_data_admin(sys.argv[1])
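Invocation sketch for the admin downloader, with an illustrative target directory; each per-country repo is cloned only if it is not already present:

# python scripts/geodata/whosonfirst/download_wof_admin_polygon.py /data/wof
from geodata.whosonfirst.download_wof_admin_polygon import download_wof_data_admin
download_wof_data_admin('/data/wof')  # e.g. whosonfirst-data-admin-us, whosonfirst-data-admin-gb, ...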
scripts/geodata/whosonfirst/download_wof_postal_codes.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import os
import requests
import subprocess
import sys
import ujson as json

this_dir = os.path.realpath(os.path.dirname(__file__))
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.file_utils import ensure_dir


WOF_PLACE_DATA_REPO = 'https://github.com/whosonfirst-data/whosonfirst-data'
SEED_URLS_JSON = 'https://raw.githubusercontent.com/whosonfirst-data/whosonfirst-data-postalcode/master/data.json'


def clone_repo(wof_dir, repo):
    repo_name = repo.rstrip('/').rsplit('/', 1)[-1]
    repo_dir = os.path.join(wof_dir, repo_name)

    # Start from a clean checkout
    subprocess.check_call(['rm', '-rf', repo_dir])
    subprocess.check_call(['git', 'clone', repo, repo_dir])

    return repo_dir


def download_wof_postcodes(wof_dir):
    ensure_dir(wof_dir)

    clone_repo(wof_dir, WOF_PLACE_DATA_REPO)

    response = requests.get(SEED_URLS_JSON)
    if response.ok:
        content = json.loads(response.content)

        # The seed manifest lists per-country postal code repos with their record counts
        for d in content:
            repo_name = d['name']

            if int(d.get('count', 0)) > 0:
                repo = d['url']
                print('cloning {}'.format(repo_name))

                clone_repo(wof_dir, repo)

            else:
                print('skipping {}'.format(repo_name))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: python download_wof_postal_codes.py wof_base_dir')
    download_wof_postcodes(sys.argv[1])
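Invocation sketch for the postal code downloader, again with an illustrative base directory:

# python scripts/geodata/whosonfirst/download_wof_postal_codes.py /data/wof-postcodes
from geodata.whosonfirst.download_wof_postal_codes import download_wof_postcodes
download_wof_postcodes('/data/wof-postcodes')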