[openaddresses] Add positional arguments so that OpenAddresses ingestion can be restricted to specific countries, subdirectories, or individual files.
This commit is contained in:
@@ -62,18 +62,18 @@ class OpenAddressesFormatter(object):
|
|||||||
field_regex_replacements = {
|
field_regex_replacements = {
|
||||||
# All fields
|
# All fields
|
||||||
None: [
|
None: [
|
||||||
(re.compile('<\s*null\s*>', re.I), six.u('')),
|
(re.compile('<\s*null\s*>', re.I), u''),
|
||||||
(re.compile('[\s]{2,}'), six.u(' ')),
|
(re.compile('[\s]{2,}'), six.u(' ')),
|
||||||
(re.compile('\`'), six.u("'")),
|
(re.compile('\`'), u"'"),
|
||||||
(re.compile('\-?\*'), six.u("")),
|
(re.compile('\-?\*'), u""),
|
||||||
],
|
],
|
||||||
AddressFormatter.HOUSE_NUMBER: [
|
AddressFormatter.HOUSE_NUMBER: [
|
||||||
# Most of the house numbers in Montreal start with "#"
|
# Most of the house numbers in Montreal start with "#"
|
||||||
(re.compile('^#', re.UNICODE), six.u('')),
|
(re.compile('^#', re.UNICODE), u''),
|
||||||
# Some house numbers have multiple hyphens
|
# Some house numbers have multiple hyphens
|
||||||
(re.compile('[\-]{2,}'), six.u('-')),
|
(re.compile('[\-]{2,}'), u'-'),
|
||||||
# Some house number ranges are split up like "12 -14"
|
# Some house number ranges are split up like "12 -14"
|
||||||
(re.compile('[\s]*\-[\s]*'), six.u('-')),
|
(re.compile('[\s]*\-[\s]*'), u'-'),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -590,7 +590,19 @@ class OpenAddressesFormatter(object):
|
|||||||
minimal_only=False, tag_components=tag_components)
|
minimal_only=False, tag_components=tag_components)
|
||||||
yield (language, country, formatted)
|
yield (language, country, formatted)
|
||||||
|
|
||||||
def build_training_data(self, base_dir, out_dir, tag_components=True):
|
def build_training_data(self, base_dir, out_dir, tag_components=True, sources_only=None):
|
||||||
|
all_sources_valid = sources_only is None
|
||||||
|
valid_sources = set()
|
||||||
|
if not all_sources_valid:
|
||||||
|
for source in sources_only:
|
||||||
|
if source.startswith(base_dir):
|
||||||
|
source = os.path.relpath(source, base_dir)
|
||||||
|
|
||||||
|
parts = source.strip('/ ').split('/')
|
||||||
|
if len(parts) > 3:
|
||||||
|
raise AssertionError('Sources may only have at maximum 3 parts')
|
||||||
|
valid_sources.add(tuple(parts))
|
||||||
|
|
||||||
if tag_components:
|
if tag_components:
|
||||||
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
||||||
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')
|
||||||
@@ -608,6 +620,9 @@ class OpenAddressesFormatter(object):
|
|||||||
for file_config in country_config.get('files', []):
|
for file_config in country_config.get('files', []):
|
||||||
filename = file_config['filename']
|
filename = file_config['filename']
|
||||||
|
|
||||||
|
if not all_sources_valid and not ((country_dir, filename) in valid_sources or (country_dir,) in valid_sources):
|
||||||
|
continue
|
||||||
|
|
||||||
print(six.u('doing {}/{}').format(country_dir, filename))
|
print(six.u('doing {}/{}').format(country_dir, filename))
|
||||||
|
|
||||||
path = os.path.join(base_dir, country_dir, filename)
|
path = os.path.join(base_dir, country_dir, filename)
|
||||||
@@ -638,6 +653,9 @@ class OpenAddressesFormatter(object):
|
|||||||
for file_config in subdir_config.get('files', []):
|
for file_config in subdir_config.get('files', []):
|
||||||
filename = file_config['filename']
|
filename = file_config['filename']
|
||||||
|
|
||||||
|
if not all_sources_valid and not ((country_dir, subdir, filename) in valid_sources or (country_dir, subdir) in valid_sources or (country_dir,) in valid_sources):
|
||||||
|
continue
|
||||||
|
|
||||||
print(six.u('doing {}/{}/{}').format(country_dir, subdir, filename))
|
print(six.u('doing {}/{}/{}').format(country_dir, subdir, filename))
|
||||||
|
|
||||||
path = os.path.join(base_dir, country_dir, subdir, filename)
|
path = os.path.join(base_dir, country_dir, subdir, filename)
|
||||||
|
|||||||
@@ -30,6 +30,8 @@ if __name__ == '__main__':
|
|||||||
# Handle argument parsing here
|
# Handle argument parsing here
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
parser.add_argument('sources', nargs='*')
|
||||||
|
|
||||||
parser.add_argument('-i', '--openaddresses-dir',
|
parser.add_argument('-i', '--openaddresses-dir',
|
||||||
help='Path to OpenAddresses directory')
|
help='Path to OpenAddresses directory')
|
||||||
|
|
||||||
@@ -88,4 +90,4 @@ if __name__ == '__main__':
|
|||||||
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
|
components = AddressComponents(osm_rtree, neighborhoods_rtree, places_index)
|
||||||
|
|
||||||
oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug)
|
oa_formatter = OpenAddressesFormatter(components, country_rtree, debug=args.debug)
|
||||||
oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged)
|
oa_formatter.build_training_data(args.openaddresses_dir, args.out_dir, tag_components=not args.untagged, sources_only=args.sources or None)
|
||||||
|
|||||||
Reference in New Issue
Block a user