diff --git a/scripts/geodata/openaddresses/config.py b/scripts/geodata/openaddresses/config.py
new file mode 100644
--- /dev/null
+++ b/scripts/geodata/openaddresses/config.py
@@ -0,0 +1,46 @@
+import os
+import six
+import yaml
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+
+OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
+                                                'resources', 'parser', 'data_sets', 'openaddresses.yaml')
+
+
+class OpenAddressesConfig(object):
+    """Parsed view of the OpenAddresses parser data-set YAML config."""
+
+    def __init__(self, path=OPENADDRESSES_PARSER_DATA_CONFIG):
+        self.path = path
+
+        # Close the handle deterministically. safe_load is used instead of
+        # yaml.load because yaml.load without an explicit Loader is
+        # rejected by newer PyYAML releases.
+        with open(path) as f:
+            config = yaml.safe_load(f)
+
+        self.config = config['global']
+        self.country_configs = config['countries']
+
+    @property
+    def sources(self):
+        """Yield one path tuple per configured source file.
+
+        Tuples are (country, filename) for entries under a country's
+        'files' and (country, subdir, filename) for entries under one of
+        its 'subdirs'; filename has its final extension stripped.
+        """
+        for country, config in six.iteritems(self.country_configs):
+            for file_config in config.get('files', []):
+                filename = file_config['filename'].rsplit('.', 1)[0]
+
+                yield country, filename
+
+            for subdir, subdir_config in six.iteritems(config.get('subdirs', {})):
+                for file_config in subdir_config.get('files', []):
+                    filename = file_config['filename'].rsplit('.', 1)[0]
+
+                    yield country, subdir, filename
+
+openaddresses_config = OpenAddressesConfig()
diff --git a/scripts/geodata/openaddresses/download_openaddresses.py b/scripts/geodata/openaddresses/download_openaddresses.py
new file mode 100644
--- /dev/null
+++ b/scripts/geodata/openaddresses/download_openaddresses.py
@@ -0,0 +1,54 @@
+import os
+import six
+import sys
+import requests
+import subprocess
+import yaml
+
+from six.moves.urllib_parse import urljoin
+
+this_dir = os.path.realpath(os.path.dirname(__file__))
+# Resolve the scripts/ directory from this file rather than from the current
+# working directory, so the geodata package is importable from any cwd.
+sys.path.append(os.path.realpath(os.path.join(this_dir, os.pardir, os.pardir)))
+
+from geodata.openaddresses.config import openaddresses_config
+from geodata.file_utils import ensure_dir, download_file, cd, remove_file
+
+BASE_OPENADDRESSES_DATA_URL = 'http://results.openaddresses.io'
+OPENADDRESSES_LATEST_URL = BASE_OPENADDRESSES_DATA_URL + '/latest/run/'
+
+OPENADDRESSES_EXTENSION = '.zip'
+
+
+def main(out_dir):
+    """Download and unzip every source listed in the OpenAddresses config.
+
+    :param out_dir: target directory; it is passed to ensure_dir and the
+                    downloads and unzip calls run inside cd(out_dir).
+    """
+    # The loop runs inside cd(out_dir) (presumably a chdir context manager);
+    # an absolute path keeps zip_path valid there for a relative out_dir.
+    out_dir = os.path.abspath(out_dir)
+    ensure_dir(out_dir)
+
+    with cd(out_dir):
+        for path in openaddresses_config.sources:
+            source = '/'.join(path)
+            zip_file = path[-1] + OPENADDRESSES_EXTENSION
+            zip_url = source + OPENADDRESSES_EXTENSION
+            url = OPENADDRESSES_LATEST_URL + zip_url
+
+            zip_path = os.path.join(out_dir, zip_file)
+
+            print(six.u('downloading: {}').format(source))
+            if download_file(url, zip_path):
+                subprocess.check_call(['unzip', zip_path])
+                remove_file(zip_path)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print('Usage: python download_openaddresses.py out_dir')
+        sys.exit(1)
+    main(sys.argv[1])
diff --git a/scripts/geodata/openaddresses/fetch_openaddresses.sh b/scripts/geodata/openaddresses/fetch_openaddresses.sh
deleted file mode 100644
index 76c68da9..00000000
--- a/scripts/geodata/openaddresses/fetch_openaddresses.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-wget http://s3.amazonaws.com/data.openaddresses.io/openaddr-collected-global.zip
-unzip openaddr-collected-global.zip
\ No newline at end of file
diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py
index 5b96cbb7..f45e25dc 100644
--- a/scripts/geodata/openaddresses/formatter.py
+++ b/scripts/geodata/openaddresses/formatter.py
@@ -18,16 +18,12 @@ from geodata.countries.names import country_names
 from geodata.encoding import safe_decode, safe_encode
 from geodata.language_id.disambiguation import UNKNOWN_LANGUAGE
 from geodata.math.sampling import cdf, weighted_choice
+from geodata.openaddresses.config import openaddresses_config
 from geodata.places.config import place_config
 from geodata.text.utils import is_numeric, is_numeric_strict
 
 from geodata.csv_utils import tsv_string, unicode_csv_reader
 
-this_dir = os.path.realpath(os.path.dirname(__file__))
-
-OPENADDRESSES_PARSER_DATA_CONFIG = os.path.join(this_dir, os.pardir, os.pardir, os.pardir,
-                                                'resources', 'parser', 'data_sets', 'openaddresses.yaml')
-
 OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME = 'openaddresses_formatted_addresses_tagged.tsv'
 OPENADDRESSES_FORMAT_DATA_FILENAME = 'openaddresses_formatted_addresses.tsv'
 
@@ -69,10 +65,6 @@ class OpenAddressesFormatter(object):
         self.components = components
         self.language_rtree = components.language_rtree
 
-        config = yaml.load(open(OPENADDRESSES_PARSER_DATA_CONFIG))
-        self.config = config['global']
-        self.country_configs = config['countries']
-
         self.debug = debug
 
         self.formatter = AddressFormatter()
@@ -414,14 +406,14 @@ class OpenAddressesFormatter(object):
 
         i = 0
 
-        for country_dir, config in six.iteritems(self.country_configs):
+        for country_dir, config in six.iteritems(openaddresses_config.country_configs):
             for file_config in config.get('files', []):
                 filename = file_config['filename']
 
                 print(six.u('doing {}/{}').format(country_dir, filename))
 
                 path = os.path.join(base_dir, country_dir, filename)
-                configs = (file_config, config, self.config)
+                configs = (file_config, config, openaddresses_config.config)
                 for language, country, formatted_address in self.formatted_addresses(path, configs, tag_components=tag_components):
                     if not formatted_address or not formatted_address.strip():
                         continue