diff --git a/scripts/geodata/whosonfirst/download_wof_postal_codes.py b/scripts/geodata/whosonfirst/download_wof_postal_codes.py new file mode 100644 index 00000000..e5549364 --- /dev/null +++ b/scripts/geodata/whosonfirst/download_wof_postal_codes.py @@ -0,0 +1,56 @@ +import gevent +from gevent import monkey +monkey.patch_all() + +import os +import requests +import subprocess +import sys +import ujson as json + +this_dir = os.path.realpath(os.path.dirname(__file__)) +sys.path.append(os.path.realpath(os.path.join(os.pardir, os.pardir))) + +from geodata.whosonfirst.crawl import WhosOnFirstCrawler +from geodata.file_utils import ensure_dir + +SEED_URLS_JSON = 'https://raw.githubusercontent.com/whosonfirst-data/whosonfirst-data-postalcode/master/data.json' + + +def clone_repo(wof_dir, repo): + repo_name = repo.rstrip('/').rsplit('/', 1)[-1] + repo_dir = os.path.join(wof_dir, repo_name) + + subprocess.check_call(['rm', '-rf', repo_dir]) + subprocess.check_call(['git', 'clone', repo, repo_dir]) + + return repo_dir + + +def download_wof_postcodes(wof_dir): + ensure_dir(wof_dir) + crawler = WhosOnFirstCrawler(wof_dir) + + response = requests.get(SEED_URLS_JSON) + if response.ok: + content = json.loads(response.content) + + for d in content: + repo_name = d['name'] + + if int(d.get('count', 0)) > 0: + repo = d['url'] + print('doing {}'.format(repo_name)) + + repo_dir = clone_repo(wof_dir, repo) + + for i, postcode in enumerate(crawler.crawl(repo_dir)): + if i % 100 == 0 and i > 0: + print('downloaded {} postcodes from WoF'.format(i)) + else: + print('skipping {}'.format(repo_name)) + +if __name__ == '__main__': + if len(sys.argv) < 2: + sys.exit('Usage: python download_wof_postal_codes.py wof_base_dir') + download_wof_postcodes(sys.argv[1])