[openaddresses] adding script option to download all completed OA files instead of just what's in the config
This commit is contained in:
@@ -1,8 +1,10 @@
|
|||||||
|
import argparse
|
||||||
import os
|
import os
|
||||||
import six
|
|
||||||
import sys
|
|
||||||
import requests
|
import requests
|
||||||
|
import six
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from six.moves.urllib_parse import urljoin
|
from six.moves.urllib_parse import urljoin
|
||||||
@@ -14,31 +16,78 @@ from geodata.openaddresses.config import openaddresses_config
|
|||||||
from geodata.file_utils import ensure_dir, download_file, cd, remove_file
|
from geodata.file_utils import ensure_dir, download_file, cd, remove_file
|
||||||
|
|
||||||
BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
|
BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
|
||||||
OPENADDRESSES_LATEST_URL = BASE_OPENADDRESSES_DATA_URL + '/latest/run/'
|
|
||||||
|
|
||||||
OPENADDRESSES_EXTENSION = '.zip'
|
OPENADDRESSES_LATEST_DIR = urljoin(BASE_OPENADDRESSES_DATA_URL, 'latest/run/')
|
||||||
|
|
||||||
|
OPENADDRESSES_STATE_FILE_NAME = 'state.txt'
|
||||||
|
OPENADDRESSES_STATE_URL = urljoin(BASE_OPENADDRESSES_DATA_URL, OPENADDRESSES_STATE_FILE_NAME)
|
||||||
|
|
||||||
|
|
||||||
def main(out_dir):
|
def download_and_unzip_file(url, out_dir):
|
||||||
ensure_dir(out_dir)
|
zip_filename = url.rsplit('/', 1)[-1].strip()
|
||||||
|
zip_local_path = os.path.join(out_dir, zip_filename)
|
||||||
|
|
||||||
with cd(out_dir):
|
success = download_file(url, zip_local_path) and subprocess.check_call(['unzip', '-o', zip_local_path, '-d', OPENADDRESSES_DIR]) == 0
|
||||||
for path in openaddresses_config.sources:
|
|
||||||
source = '/'.join(path)
|
|
||||||
zip_file = path[-1] + OPENADDRESSES_EXTENSION
|
|
||||||
zip_url = source + OPENADDRESSES_EXTENSION
|
|
||||||
url = OPENADDRESSES_LATEST_URL + zip_url
|
|
||||||
|
|
||||||
zip_path = os.path.join(out_dir, zip_file)
|
if os.path.exists(zip_local_path):
|
||||||
|
remove_file(zip_local_path)
|
||||||
|
|
||||||
print('downloading: {}'.format(source))
|
return success
|
||||||
if download_file(url, zip_path):
|
|
||||||
subprocess.check_call(['unzip', '-o', zip_path])
|
|
||||||
remove_file(zip_path)
|
def openaddresses_download_all_files(out_dir):
    """Download and unzip every processed source listed in the state file.

    Fetches the tab-delimited state file from OPENADDRESSES_STATE_URL into
    the system temp directory, reads its 'source' and 'processed' columns,
    and passes each non-empty 'processed' URL to download_and_unzip_file.
    Failures for individual sources are printed and do not stop the run.

    :param out_dir: directory handed to download_and_unzip_file for each
        source archive.
    :raises SystemExit: if the state file itself cannot be downloaded.
    :raises ValueError: if the header row lacks a 'source' or 'processed'
        column.
    """
    temp_dir = tempfile.gettempdir()

    local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
    if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
        sys.exit('Could not download state.txt file')

    try:
        # NOTE(review): unicode_csv_reader is assumed to wrap csv.reader, which
        # iterates over lines; handing it the path string would iterate over
        # the characters of the file name instead. Confirm against its
        # definition.
        with open(local_state_file_path) as state_file:
            reader = unicode_csv_reader(state_file, delimiter='\t')

            # next() works on both Python 2 and 3 iterators; .next() is 2-only.
            headers = next(reader)
            source_index = headers.index('source')
            url_index = headers.index('processed')
            min_columns = max(source_index, url_index) + 1

            for row in reader:
                # Skip blank or truncated rows rather than raising IndexError.
                if len(row) < min_columns:
                    continue

                # Strip only the trailing extension so that source names
                # containing dots are reported in full.
                source = row[source_index].rsplit('.', 1)[0]
                processed = row[url_index]
                if not processed or not processed.strip():
                    # No processed archive is listed for this source.
                    continue

                print(six.u('doing {}').format(source))
                success = download_and_unzip_file(processed, out_dir)
                if not success:
                    print(six.u('ERR: could not download {}').format(source))
    finally:
        # Always clean up the temporary state file, even if a download raises.
        remove_file(local_state_file_path)
|
||||||
|
|
||||||
|
|
||||||
|
def openaddresses_download_configured_files(out_dir):
    """Download and unzip only the sources listed in openaddresses_config.

    Each entry of openaddresses_config.sources is a sequence of path
    components; they are joined with '/' and suffixed with '.zip' to build
    the archive URL relative to OPENADDRESSES_LATEST_DIR. A failed source is
    reported on stdout and the loop moves on to the next one.

    :param out_dir: directory handed to download_and_unzip_file for each
        source archive.
    """
    for components in openaddresses_config.sources:
        source = '/'.join(components)
        archive_url = urljoin(OPENADDRESSES_LATEST_DIR, source + '.zip')

        print(six.u('doing {}').format(source))
        if download_and_unzip_file(archive_url, out_dir):
            continue
        print(six.u('ERR: could not download {}').format(source))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) < 2:
|
parser = argparse.ArgumentParser()
|
||||||
print('Usage: python download_openaddresses.py out_dir')
|
|
||||||
sys.exit(1)
|
parser.add_argument('-o', '--out-dir',
|
||||||
main(sys.argv[1])
|
required=True,
|
||||||
|
help='Output directory')
|
||||||
|
|
||||||
|
parser.add_argument('--all', action='store_true',
|
||||||
|
default=False, help='Download all completed OpenAddresses files')
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
ensure_dir(args.out_dir)
|
||||||
|
|
||||||
|
if args.all:
|
||||||
|
openaddresses_download_all_files(args.out_dir)
|
||||||
|
else:
|
||||||
|
openaddresses_download_configured_files(args.out_dir)
|
||||||
|
|||||||
Reference in New Issue
Block a user