libpostal/scripts/geodata/whosonfirst/download_wof_postal_codes.py

import gevent
from gevent import monkey
monkey.patch_all()

import os
import requests
import subprocess
import sys
import ujson as json

# Absolute, symlink-resolved directory that contains this script.
this_dir = os.path.realpath(os.path.dirname(__file__))
# Add the directory two levels above this script to the import path so the
# ``geodata`` imports below can be resolved. The path is anchored at this_dir
# rather than the current working directory, so the script can be launched
# from anywhere.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

from geodata.whosonfirst.crawl import WhosOnFirstCrawler
from geodata.file_utils import ensure_dir

SEED_URLS_JSON = 'https://raw.githubusercontent.com/whosonfirst-data/whosonfirst-data-postalcode/master/data.json'


def clone_repo(wof_dir, repo):
    """Clone ``repo`` into a fresh subdirectory of ``wof_dir``.

    The subdirectory is named after the last path component of ``repo``
    (trailing slashes ignored). Any existing directory of that name is
    removed first, so each call starts from a clean checkout.

    :param wof_dir: base directory that holds the cloned repositories
    :param repo: URL (or path) of the git repository to clone
    :return: path of the directory the repository was cloned into
    :raises ValueError: if no usable directory name can be derived from
        ``repo``
    :raises subprocess.CalledProcessError: if ``rm`` or ``git clone`` exits
        with a non-zero status
    """
    repo_name = repo.rstrip('/').rsplit('/', 1)[-1]

    # An empty name makes repo_dir point at wof_dir itself, and '.' / '..'
    # point at wof_dir or its parent; the ``rm -rf`` below must never be
    # aimed at anything other than a single checkout directory.
    if repo_name in ('', '.', '..'):
        raise ValueError(
            'cannot derive a repository directory name from {!r}'.format(repo))

    repo_dir = os.path.join(wof_dir, repo_name)

    subprocess.check_call(['rm', '-rf', repo_dir])
    # '--' ends option parsing so a URL starting with '-' cannot be
    # interpreted by git as a command-line option.
    subprocess.check_call(['git', 'clone', '--', repo, repo_dir])

    return repo_dir


def download_wof_postcodes(wof_dir, timeout=60):
    """Clone and crawl every non-empty repository listed at SEED_URLS_JSON.

    The seed document is fetched and parsed as JSON, then iterated. Each
    entry is read for its ``name``, ``url`` and (optional) ``count`` keys.
    Entries whose count is positive are cloned under ``wof_dir`` with
    ``clone_repo`` and the checkout is passed to
    ``WhosOnFirstCrawler.crawl``, whose results are consumed one by one;
    a progress line is printed every 100 items. Other entries are skipped.

    :param wof_dir: base directory handed to ``ensure_dir``, the crawler
        and ``clone_repo``
    :param timeout: seconds to wait for the seed list request before
        giving up
    :raises requests.HTTPError: if the seed list request returns an error
        status
    """
    ensure_dir(wof_dir)
    crawler = WhosOnFirstCrawler(wof_dir)

    # Without an explicit timeout requests will wait indefinitely on a
    # stalled connection.
    response = requests.get(SEED_URLS_JSON, timeout=timeout)
    # Fail loudly instead of returning as if the download had succeeded
    # when the seed list is unavailable.
    response.raise_for_status()

    for d in json.loads(response.content):
        repo_name = d['name']

        # A missing count is treated as zero, i.e. nothing to fetch.
        if int(d.get('count', 0)) <= 0:
            print('skipping {}'.format(repo_name))
            continue

        repo = d['url']
        print('doing {}'.format(repo_name))

        repo_dir = clone_repo(wof_dir, repo)

        # Count from 1 so the progress message reports how many items have
        # actually been consumed (a zero-based index is one behind).
        for i, _postcode in enumerate(crawler.crawl(repo_dir), 1):
            if i % 100 == 0:
                print('downloaded {} postcodes from WoF'.format(i))

if __name__ == '__main__':
    # Command-line entry point: the first positional argument is the base
    # directory passed to download_wof_postcodes; extra arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        sys.exit('Usage: python download_wof_postal_codes.py wof_base_dir')
    download_wof_postcodes(cli_args[0])