From 4ed362d5f820a238ce2272acd3ba3c92d7297bb3 Mon Sep 17 00:00:00 2001
From: Al
Date: Wed, 31 Aug 2016 17:43:07 -0400
Subject: [PATCH] [openaddresses] adding script option to download all completed OA files instead of just what's in the config

---
 .../openaddresses/download_openaddresses.py   | 126 ++++++++++++++----
 1 file changed, 105 insertions(+), 21 deletions(-)

diff --git a/scripts/geodata/openaddresses/download_openaddresses.py b/scripts/geodata/openaddresses/download_openaddresses.py
index d42290bf..3c4ee150 100644
--- a/scripts/geodata/openaddresses/download_openaddresses.py
+++ b/scripts/geodata/openaddresses/download_openaddresses.py
@@ -1,8 +1,10 @@
+import argparse
 import os
-import six
-import sys
 import requests
+import six
 import subprocess
+import sys
+import tempfile
 import yaml
 
 from six.moves.urllib_parse import urljoin
@@ -14,31 +16,113 @@ from geodata.openaddresses.config import openaddresses_config
 from geodata.file_utils import ensure_dir, download_file, cd, remove_file
 
 BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
-OPENADDRESSES_LATEST_URL = BASE_OPENADDRESSES_DATA_URL + '/latest/run/'
-OPENADDRESSES_EXTENSION = '.zip'
+OPENADDRESSES_LATEST_DIR = urljoin(BASE_OPENADDRESSES_DATA_URL, 'latest/run/')
+
+OPENADDRESSES_STATE_FILE_NAME = 'state.txt'
+OPENADDRESSES_STATE_URL = urljoin(BASE_OPENADDRESSES_DATA_URL, OPENADDRESSES_STATE_FILE_NAME)
 
 
-def main(out_dir):
-    ensure_dir(out_dir)
+def download_and_unzip_file(url, out_dir):
+    """Download the zip archive at ``url`` and extract it into ``out_dir``.
+
+    The downloaded archive is removed afterwards whether or not the
+    download and the extraction succeeded.
+
+    Returns True only if the download succeeded and ``unzip`` exited with
+    status 0, False otherwise.
+    """
+    zip_filename = url.rsplit('/', 1)[-1].strip()
+    zip_local_path = os.path.join(out_dir, zip_filename)
 
-    with cd(out_dir):
-        for path in openaddresses_config.sources:
-            source = '/'.join(path)
-            zip_file = path[-1] + OPENADDRESSES_EXTENSION
-            zip_url = source + OPENADDRESSES_EXTENSION
-            url = OPENADDRESSES_LATEST_URL + zip_url
+    try:
+        # subprocess.call reports a failed unzip through its exit status;
+        # check_call would raise CalledProcessError instead, so callers
+        # could never log the failure and continue with the next source.
+        success = bool(download_file(url, zip_local_path)) and subprocess.call(
+            ['unzip', '-o', zip_local_path, '-d', out_dir]) == 0
+    finally:
+        if os.path.exists(zip_local_path):
+            remove_file(zip_local_path)
 
-            zip_path = os.path.join(out_dir, zip_file)
+    return success
 
-            print('downloading: {}'.format(source))
-            if download_file(url, zip_path):
-                subprocess.check_call(['unzip', '-o', zip_path])
-                remove_file(zip_path)
+
+def openaddresses_download_all_files(out_dir):
+    """Download and unzip every processed file listed in state.txt.
+
+    Fetches the OpenAddresses state file into the system temp directory,
+    reads it as tab-separated values and, for every row with a non-empty
+    'processed' column, downloads that URL and extracts it into
+    ``out_dir``. Sources that fail are reported and skipped.
+
+    Exits the process if the state file itself cannot be downloaded.
+    """
+    temp_dir = tempfile.gettempdir()
+
+    local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
+    if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
+        sys.exit('Could not download state.txt file')
+
+    try:
+        # NOTE(review): confirm unicode_csv_reader is imported in this
+        # module and that it accepts a path (not an open file object);
+        # neither is visible from here.
+        reader = unicode_csv_reader(local_state_file_path, delimiter='\t')
+
+        # next() works on Python 2 and 3; reader.next() is Python 2 only.
+        headers = next(reader)
+        source_index = headers.index('source')
+        url_index = headers.index('processed')
+
+        for row in reader:
+            # Strip only the final extension so that dots elsewhere in
+            # the source name are kept.
+            source = row[source_index].rsplit('.', 1)[0]
+            processed = row[url_index]
+            if not processed or not processed.strip():
+                continue
+
+            print(six.u('doing {}').format(source))
+            success = download_and_unzip_file(processed, out_dir)
+            if not success:
+                print(six.u('ERR: could not download {}').format(source))
+    finally:
+        # Remove the temporary state file even if parsing fails part way.
+        remove_file(local_state_file_path)
+
+
+def openaddresses_download_configured_files(out_dir):
+    """Download and unzip the sources listed in openaddresses_config.
+
+    Each entry of ``openaddresses_config.sources`` is joined with '/' to
+    form the source path, which is resolved against the latest-run
+    directory. Sources that fail are reported and skipped.
+    """
+    for path in openaddresses_config.sources:
+        source = '/'.join(path)
+        zip_path = source + '.zip'
+        url = urljoin(OPENADDRESSES_LATEST_DIR, zip_path)
+
+        print(six.u('doing {}').format(source))
+        success = download_and_unzip_file(url, out_dir)
+        if not success:
+            print(six.u('ERR: could not download {}').format(source))
 
 
 if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print('Usage: python download_openaddresses.py out_dir')
-        sys.exit(1)
+    parser = argparse.ArgumentParser()
 
-    main(sys.argv[1])
+    parser.add_argument('-o', '--out-dir',
+                        required=True,
+                        help='Output directory')
+
+    parser.add_argument('--all', action='store_true',
+                        default=False, help='Download all completed OpenAddresses files')
+
+    args = parser.parse_args()
+    ensure_dir(args.out_dir)
+
+    if args.all:
+        openaddresses_download_all_files(args.out_dir)
+    else:
+        openaddresses_download_configured_files(args.out_dir)