From d0679294bfc7fb87fcfb94930f8da90e9d7e1d38 Mon Sep 17 00:00:00 2001 From: Al Date: Fri, 24 Feb 2017 03:39:21 -0500 Subject: [PATCH] [openaddresses] adding positional args so OpenAddresses ingestion can be run only for specific countries, subdirs, or individual files. --- scripts/geodata/openaddresses/formatter.py | 32 +++++++++++++++---- .../openaddresses_training_data.py | 4 ++- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 69d54f1d..4474dbee 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -62,18 +62,18 @@ class OpenAddressesFormatter(object): field_regex_replacements = { # All fields None: [ - (re.compile('<\s*null\s*>', re.I), six.u('')), + (re.compile('<\s*null\s*>', re.I), u''), (re.compile('[\s]{2,}'), six.u(' ')), - (re.compile('\`'), six.u("'")), - (re.compile('\-?\*'), six.u("")), + (re.compile('\`'), u"'"), + (re.compile('\-?\*'), u""), ], AddressFormatter.HOUSE_NUMBER: [ # Most of the house numbers in Montreal start with "#" - (re.compile('^#', re.UNICODE), six.u('')), + (re.compile('^#', re.UNICODE), u''), # Some house numbers have multiple hyphens - (re.compile('[\-]{2,}'), six.u('-')), + (re.compile('[\-]{2,}'), u'-'), # Some house number ranges are split up like "12 -14" - (re.compile('[\s]*\-[\s]*'), six.u('-')), + (re.compile('[\s]*\-[\s]*'), u'-'), ] } @@ -590,7 +590,19 @@ class OpenAddressesFormatter(object): minimal_only=False, tag_components=tag_components) yield (language, country, formatted) - def build_training_data(self, base_dir, out_dir, tag_components=True): + def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None): + all_sources_valid = sources_only is None + valid_sources = set() + if not all_sources_valid: + for source in sources_only: + if source.startswith(base_dir): + source = os.path.relpath(source, base_dir) + + parts = source.strip('/ ').split('/') + if len(parts) > 3: + raise AssertionError('Sources may only have at maximum 3 parts') + valid_sources.add(tuple(parts)) + if tag_components: formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w') writer = csv.writer(formatted_tagged_file, 'tsv_no_quote') @@ -608,6 +620,9 @@ class OpenAddressesFormatter(object): for file_config in country_config.get('files', []): filename = file_config['filename'] + if not all_sources_valid and not ((country_dir, filename) in valid_sources or (country_dir,) in valid_sources): + continue + print(six.u('doing {}/{}').format(country_dir, filename)) path = os.path.join(base_dir, country_dir, filename) @@ -638,6 +653,9 @@ class OpenAddressesFormatter(object): for file_config in subdir_config.get('files', []): filename = file_config['filename'] + if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or (country_dir, subdir) in valid_sources or (country_dir,) in valid_sources): + continue + print(six.u('doing {}/{}/{}').format(country_dir, subdir, filename)) path = os.path.join(base_dir, country_dir, subdir, filename) diff --git a/scripts/geodata/openaddresses/openaddresses_training_data.py b/scripts/geodata/openaddresses/openaddresses_training_data.py index 144d682b..b4f15b52 100644 --- a/scripts/geodata/openaddresses/openaddresses_training_data.py +++ b/scripts/geodata/openaddresses/openaddresses_training_data.py @@ -30,6 +30,8 @@ if __name__ == '__main__': # Handle argument parsing here parser = argparse.ArgumentParser() + parser.add_argument('sources', nargs='*') + parser.add_argument('-i', '--openaddresses-dir', help='Path to OpenAddresses directory') @@ -88,4 +90,4 @@ if __name__ == '__main__': components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index) oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug) - oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged) + oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged, sources_only=args.sources or None)