[openaddresses] adding script option to download all completed OA files instead of just what's in the config
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
import argparse
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
import requests
|
||||
import six
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import yaml
|
||||
|
||||
from six.moves.urllib_parse import urljoin
|
||||
@@ -14,31 +16,78 @@ from geodata.openaddresses.config import openaddresses_config
|
||||
from geodata.file_utils import ensure_dir, download_file, cd, remove_file
|
||||
|
||||
BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
|
||||
OPENADDRESSES_LATEST_URL = BASE_OPENADDRESSES_DATA_URL + '/latest/run/'
|
||||
|
||||
OPENADDRESSES_EXTENSION = '.zip'
|
||||
OPENADDRESSES_LATEST_DIR = urljoin(BASE_OPENADDRESSES_DATA_URL, 'latest/run/')
|
||||
|
||||
OPENADDRESSES_STATE_FILE_NAME = 'state.txt'
|
||||
OPENADDRESSES_STATE_URL = urljoin(BASE_OPENADDRESSES_DATA_URL, OPENADDRESSES_STATE_FILE_NAME)
|
||||
|
||||
|
||||
def main(out_dir):
|
||||
ensure_dir(out_dir)
|
||||
def download_and_unzip_file(url, out_dir):
|
||||
zip_filename = url.rsplit('/', 1)[-1].strip()
|
||||
zip_local_path = os.path.join(out_dir, zip_filename)
|
||||
|
||||
with cd(out_dir):
|
||||
for path in openaddresses_config.sources:
|
||||
source = '/'.join(path)
|
||||
zip_file = path[-1] + OPENADDRESSES_EXTENSION
|
||||
zip_url = source + OPENADDRESSES_EXTENSION
|
||||
url = OPENADDRESSES_LATEST_URL + zip_url
|
||||
success = download_file(url, zip_local_path) and subprocess.check_call(['unzip', '-o', zip_local_path, '-d', OPENADDRESSES_DIR]) == 0
|
||||
|
||||
zip_path = os.path.join(out_dir, zip_file)
|
||||
if os.path.exists(zip_local_path):
|
||||
remove_file(zip_local_path)
|
||||
|
||||
print('downloading: {}'.format(source))
|
||||
if download_file(url, zip_path):
|
||||
subprocess.check_call(['unzip', '-o', zip_path])
|
||||
remove_file(zip_path)
|
||||
return success
|
||||
|
||||
|
||||
def openaddresses_download_all_files(out_dir):
|
||||
temp_dir = tempfile.gettempdir()
|
||||
|
||||
local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
|
||||
if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
|
||||
sys.exit('Could not download state.txt file')
|
||||
|
||||
reader = unicode_csv_reader(local_state_file_path, delimiter='\t')
|
||||
|
||||
headers = reader.next()
|
||||
source_index = headers.index('source')
|
||||
url_index = headers.index('processed')
|
||||
|
||||
for row in reader:
|
||||
source = row[source_index].rsplit('.')[0]
|
||||
processed = row[url_index]
|
||||
if not processed or not processed.strip():
|
||||
continue
|
||||
|
||||
print(six.u('doing {}').format(source))
|
||||
success = download_and_unzip_file(processed, out_dir)
|
||||
if not success:
|
||||
print(six.u('ERR: could not download {}').format(source))
|
||||
|
||||
remove_file(local_state_file_path)
|
||||
|
||||
|
||||
def openaddresses_download_configured_files(out_dir):
|
||||
for path in openaddresses_config.sources:
|
||||
source = '/'.join(path)
|
||||
zip_path = source + '.zip'
|
||||
url = urljoin(OPENADDRESSES_LATEST_DIR, zip_path)
|
||||
|
||||
print(six.u('doing {}').format(source))
|
||||
success = download_and_unzip_file(url, out_dir)
|
||||
if not success:
|
||||
print(six.u('ERR: could not download {}').format(source))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) < 2:
|
||||
print('Usage: python download_openaddresses.py out_dir')
|
||||
sys.exit(1)
|
||||
main(sys.argv[1])
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('-o', '--out-dir',
|
||||
required=True,
|
||||
help='Output directory')
|
||||
|
||||
parser.add_argument('--all', action='store_true',
|
||||
default=False, help='Download all completed OpenAddresses files')
|
||||
|
||||
args = parser.parse_args()
|
||||
ensure_dir(args.out_dir)
|
||||
|
||||
if args.all:
|
||||
openaddresses_download_all_files(args.out_dir)
|
||||
else:
|
||||
openaddresses_download_configured_files(args.out_dir)
|
||||
|
||||
Reference in New Issue
Block a user