[openaddresses] Using a download script to download the individual OA files of interest rather than the collected file with expansions applied
This commit is contained in:
33
scripts/geodata/openaddresses/config.py
Normal file
33
scripts/geodata/openaddresses/config.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import os
|
||||
import six
|
||||
import yaml
|
||||
|
||||
# Directory holding this module, with symlinks in the directory path resolved.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Location of the OpenAddresses data-set config: three directory levels up
# from this module's directory, then resources/parser/data_sets. The ".."
# components are kept as-is (the path is not normalised).
OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(
    this_dir,
    *([os.pardir] * 3 + ['resources', 'parser', 'data_sets', 'openaddresses.yaml'])
)
|
||||
|
||||
|
||||
class OpenAddressesConfig(object):
    """In-memory view of the OpenAddresses data-set YAML config.

    Attributes:
        path: path of the YAML file that was loaded.
        config: the mapping stored under the top-level ``global`` key.
        country_configs: the mapping stored under the top-level
            ``countries`` key, keyed by country.
    """

    def __init__(self, path=OPENADDRESSES_PARSER_DATA_CONFIG):
        """Load the config found at *path*.

        Raises:
            IOError/OSError: if *path* cannot be opened.
            KeyError: if the ``global`` or ``countries`` key is missing.
        """
        self.path = path

        # Use a context manager so the file handle is closed promptly.
        # NOTE(review): this was ``yaml.load`` without a Loader, which is
        # deprecated in PyYAML 5.1 and an error in PyYAML 6. ``safe_load``
        # is used on the assumption that the config holds only plain YAML
        # types -- confirm it uses no custom Python tags.
        with open(path) as f:
            config = yaml.safe_load(f)

        self.config = config['global']
        self.country_configs = config['countries']

    @property
    def sources(self):
        """Yield one tuple per configured source file.

        Files listed directly under a country yield ``(country, filename)``;
        files listed under one of the country's ``subdirs`` yield
        ``(country, subdir, filename)``. In both cases *filename* has
        everything after its last ``.`` removed.
        """
        for country, config in six.iteritems(self.country_configs):
            # ``or`` also covers a key that is present but empty (None).
            for file_config in config.get('files') or []:
                filename = file_config['filename'].rsplit('.', 1)[0]
                yield country, filename

            for subdir, subdir_config in six.iteritems(config.get('subdirs') or {}):
                for file_config in subdir_config.get('files') or []:
                    filename = file_config['filename'].rsplit('.', 1)[0]
                    yield country, subdir, filename
|
||||
|
||||
# Shared module-level instance; constructing it with no arguments loads the
# default config path (OPENADDRESSES_PARSER_DATA_CONFIG) at import time.
openaddresses_config = OpenAddressesConfig()
|
||||
46
scripts/geodata/openaddresses/download_openaddresses.py
Normal file
46
scripts/geodata/openaddresses/download_openaddresses.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import os
|
||||
import six
|
||||
import sys
|
||||
import requests
|
||||
import subprocess
|
||||
import yaml
|
||||
|
||||
from six.moves.urllib_parse import urljoin
|
||||
|
||||
# Directory holding this script, with symlinks in the directory path resolved.
this_dir = os.path.realpath(os.path.dirname(__file__))

# Make the ``geodata`` package importable. The entry is built from this
# file's location (two levels up) rather than the current working
# directory, so the script can be launched from anywhere.
sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))

# The config path constant lives in geodata.openaddresses.config (it is no
# longer defined in the formatter module); the shared config instance used
# by main() is exported from the same module.
from geodata.openaddresses.config import (OPENADDRESSES_PARSER_DATA_CONFIG,
                                          openaddresses_config)
from geodata.file_utils import ensure_dir, download_file, cd, remove_file

# Host serving the per-source OpenAddresses results.
BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
# Prefix under which each source's archive is fetched.
OPENADDRESSES_LATEST_URL = BASE_OPENADDRESSES_DATA_URL + '/latest/run/'

# Extension appended to each source name to form the archive name.
OPENADDRESSES_EXTENSION = '.zip'
|
||||
|
||||
|
||||
def main(out_dir):
    """Download and unpack every configured OpenAddresses source into *out_dir*.

    For each path tuple yielded by ``openaddresses_config.sources`` the
    archive ``<OPENADDRESSES_LATEST_URL><parts joined by '/'>.zip`` is
    downloaded into *out_dir*. When ``download_file`` returns a truthy
    value the archive is extracted with the ``unzip`` command and then
    removed.

    Args:
        out_dir: directory to download into; passed to ``ensure_dir`` first.

    Raises:
        subprocess.CalledProcessError: if ``unzip`` exits with a non-zero
            status.
    """
    # Imported here because the module never binds this name at top level.
    from geodata.openaddresses.config import openaddresses_config

    # Resolve before changing directory: joining a relative out_dir after
    # cd(out_dir) would point at out_dir/out_dir/<file>.
    # NOTE(review): assumes cd() switches the working directory -- confirm.
    out_dir = os.path.abspath(out_dir)
    ensure_dir(out_dir)

    with cd(out_dir):
        for path in openaddresses_config.sources:
            source = '/'.join(path)
            # Only the last path component names the local archive.
            zip_file = path[-1] + OPENADDRESSES_EXTENSION
            url = OPENADDRESSES_LATEST_URL + source + OPENADDRESSES_EXTENSION

            zip_path = os.path.join(out_dir, zip_file)

            print('downloading: {}'.format(source))
            if download_file(url, zip_path):
                subprocess.check_call(['unzip', zip_path])
                remove_file(zip_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: the first argument is the output directory.
    cli_args = sys.argv[1:]
    if not cli_args:
        print('Usage: python download_openaddresses.py out_dir')
        sys.exit(1)
    main(cli_args[0])
|
||||
@@ -1,2 +0,0 @@
|
||||
wget http://s3.amazonaws.com/data.openaddresses.io/openaddr-collected-global.zip
|
||||
unzip openaddr-collected-global.zip
|
||||
@@ -18,16 +18,12 @@ from geodata.countries.names import country_names
|
||||
from geodata.encoding import safe_decode, safe_encode
|
||||
from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE
|
||||
from geodata.math.sampling import cdf, weighted_choice
|
||||
from geodata.openaddresses.config import openaddresses_config
|
||||
from geodata.places.config import place_config
|
||||
from geodata.text.utils import is_numeric, is_numeric_strict
|
||||
|
||||
from geodata.csv_utils import tsv_string, unicode_csv_reader
|
||||
|
||||
this_dir = os.path.realpath(os.path.dirname(__file__))
|
||||
|
||||
OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
|
||||
'resources', 'parser', 'data_sets', 'openaddresses.yaml')
|
||||
|
||||
OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
|
||||
OPENADDRESSES_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
|
||||
|
||||
@@ -69,10 +65,6 @@ class OpenAddressesFormatter(object):
|
||||
self.components = components
|
||||
self.language_rtree = components.language_rtree
|
||||
|
||||
config = yaml.load(open(OPENADDRESSES_PARSER_DATA_CONFIG))
|
||||
self.config = config['global']
|
||||
self.country_configs = config['countries']
|
||||
|
||||
self.debug = debug
|
||||
|
||||
self.formatter = AddressFormatter()
|
||||
@@ -414,14 +406,14 @@ class OpenAddressesFormatter(object):
|
||||
|
||||
i = 0
|
||||
|
||||
for country_dir, config in six.iteritems(self.country_configs):
|
||||
for country_dir, config in six.iteritems(openaddresses_config.country_configs):
|
||||
for file_config in config.get('files', []):
|
||||
filename = file_config['filename']
|
||||
|
||||
print(six.u('doing {}/{}').format(country_dir, filename))
|
||||
|
||||
path = os.path.join(base_dir, country_dir, filename)
|
||||
configs = (file_config, config, self.config)
|
||||
configs = (file_config, config, openaddresses_config.config)
|
||||
for language, country, formatted_address in self.formatted_addresses(path, configs, tag_components=tag_components):
|
||||
if not formatted_address or not formatted_address.strip():
|
||||
continue
|
||||
|
||||
Reference in New Issue
Block a user