From b50cb0cdf998dc688ae3355fa386a2044265e601 Mon Sep 17 00:00:00 2001 From: Al Date: Mon, 25 Jul 2016 00:04:44 -0400 Subject: [PATCH] [osm] add random variations of the containing components' names in building place training data. For places with small or unknown populations, use the default names of the containing components --- scripts/geodata/osm/formatter.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/geodata/osm/formatter.py b/scripts/geodata/osm/formatter.py index 61c3f8fd..5615df80 100644 --- a/scripts/geodata/osm/formatter.py +++ b/scripts/geodata/osm/formatter.py @@ -422,13 +422,15 @@ class OSMAddressFormatter(object): language_suffix = '' if name and name.strip(): - address_components = {component_name: name.strip()} - self.components.add_admin_boundaries(address_components, osm_components, country, language, - language_suffix=language_suffix) + for i in xrange(num_references): + address_components = {component_name: name.strip()} + self.components.add_admin_boundaries(address_components, osm_components, country, language, + random_key=num_references > 1, + language_suffix=language_suffix) - self.components.normalize_place_names(address_components, osm_components, country=country, languages=all_local_languages) + self.components.normalize_place_names(address_components, osm_components, country=country, languages=all_local_languages) - place_tags.append((address_components, None, True)) + place_tags.append((address_components, None, True)) for language, is_default in local_languages: if is_default and not more_than_one_official_language: @@ -441,13 +443,15 @@ class OSMAddressFormatter(object): if not name or not name.strip(): continue - address_components = {component_name: name.strip()} - self.components.add_admin_boundaries(address_components, osm_components, country, language, - language_suffix=language_suffix) + for i in xrange(num_references if is_default else 1): + address_components = {component_name: name.strip()} + self.components.add_admin_boundaries(address_components, osm_components, country, language, + random_key=is_default, + language_suffix=language_suffix) - self.components.normalize_place_names(address_components, osm_components, country=country, languages=all_local_languages) + self.components.normalize_place_names(address_components, osm_components, country=country, languages=all_local_languages) - place_tags.append((address_components, language, is_default)) + place_tags.append((address_components, language, is_default)) for language in random_languages - all_local_languages: language_suffix = ':{}'.format(language) @@ -467,7 +471,7 @@ class OSMAddressFormatter(object): for address_components in place_tags: address_components[AddressFormatter.POSTCODE] = random.choice(postal_codes) - return place_tags, num_references, country + return place_tags, country def category_queries(self, tags, address_components, language, country=None, tag_components=True): formatted_addresses = [] @@ -758,7 +762,7 @@ class OSMAddressFormatter(object): writer = csv.writer(formatted_file, 'tsv_no_quote') for node_id, tags, deps in parse_osm(infile): - place_tags, num_references, country = self.node_place_tags(tags) + place_tags, country = self.node_place_tags(tags) for address_components, language, is_default in place_tags: addresses = self.formatted_places(address_components, country, language) if language is None: @@ -774,8 +778,7 @@ class OSMAddressFormatter(object): else: row = (address, ) - for j in xrange(num_references if is_default else 1): - writer.writerow(row) + writer.writerow(row) i += 1 if i % 1000 == 0 and i > 0: