From 27c5c8536a2d3bdd53d85a253af5c193bcd8dbb9 Mon Sep 17 00:00:00 2001 From: Al Date: Sun, 28 Aug 2016 17:58:41 -0400 Subject: [PATCH] [openaddresses] adding debug argument to OpenAddresses training data --- scripts/geodata/openaddresses/formatter.py | 8 +++++++- .../geodata/openaddresses/openaddresses_training_data.py | 7 ++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 4f64de82..00749179 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -63,7 +63,7 @@ class OpenAddressesFormatter(object): re.I | re.UNICODE) unit_type_regexes[lang] = pattern - def __init__(self, components): + def __init__(self, components, debug=False): self.components = components self.language_rtree = components.language_rtree @@ -71,6 +71,8 @@ class OpenAddressesFormatter(object): self.config = config['global'] self.country_configs = config['countries'] + self.debug = debug + self.formatter = AddressFormatter() class validators: @@ -432,6 +434,8 @@ class OpenAddressesFormatter(object): i += 1 if i % 1000 == 0 and i > 0: print('did {} formatted addresses'.format(i)) + if self.debug: + break for subdir, subdir_config in six.iteritems(config.get('subdirs', {})): for file_config in subdir_config.get('files', []): @@ -460,3 +464,5 @@ class OpenAddressesFormatter(object): i += 1 if i % 1000 == 0 and i > 0: print('did {} formatted addresses'.format(i)) + if self.debug: + break diff --git a/scripts/geodata/openaddresses/openaddresses_training_data.py b/scripts/geodata/openaddresses/openaddresses_training_data.py index afa06d26..9f9a5f1f 100644 --- a/scripts/geodata/openaddresses/openaddresses_training_data.py +++ b/scripts/geodata/openaddresses/openaddresses_training_data.py @@ -55,6 +55,11 @@ if __name__ == '__main__': default=None, help='Neighborhoods reverse geocoder RTree directory') + parser.add_argument('--debug', + action='store_true', + default=False, + help='Test on a sample of each file to debug config') + parser.add_argument('-o', '--out-dir', default=os.getcwd(), help='Output directory') @@ -83,5 +88,5 @@ if __name__ == '__main__': if args.openaddresses_dir and args.format: components = AddressComponents(osm_rtree, language_rtree, neighborhoods_rtree, quattroshapes_rtree, geonames) - oa_formatter = OpenAddressesFormatter(components) + oa_formatter = OpenAddressesFormatter(components, debug=args.debug) oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged)