From 769a65b808deb407b4e2f2f3130fefd9eee1670b Mon Sep 17 00:00:00 2001 From: Al Date: Thu, 8 Sep 2016 03:17:21 -0400 Subject: [PATCH] [openaddresses] adding place-only and place+postcode probability to OpenAddresses to capture more place names not in OSM as standalone queries --- resources/parser/data_sets/openaddresses.yaml | 3 +++ scripts/geodata/openaddresses/formatter.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/resources/parser/data_sets/openaddresses.yaml b/resources/parser/data_sets/openaddresses.yaml index 0e742c5d..6c884975 100644 --- a/resources/parser/data_sets/openaddresses.yaml +++ b/resources/parser/data_sets/openaddresses.yaml @@ -9,6 +9,9 @@ global: abbreviate_unit_probability: 0.3 separate_unit_probability: 0.2 + place_only_probability: 0.2 + place_and_postcode_probability: 0.1 + fields: &default_fields - field_name: NUMBER component: house_number diff --git a/scripts/geodata/openaddresses/formatter.py b/scripts/geodata/openaddresses/formatter.py index 21e71f3d..6fafcacf 100644 --- a/scripts/geodata/openaddresses/formatter.py +++ b/scripts/geodata/openaddresses/formatter.py @@ -250,6 +250,11 @@ class OpenAddressesFormatter(object): numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False) postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False) + place_only_probability = float(self.get_property('place_only_probability', *configs)) + place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs)) + + drop_address_probability = place_only_probability + place_and_postcode_probability + ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or []) ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE) @@ -463,6 +468,18 @@ class OpenAddressesFormatter(object): minimal_only=False, tag_components=tag_components) yield (language, country, formatted) + rand_val = random.random() + + if street and house_number and rand_val < drop_address_probability: + components = self.components.drop_address(components) + + if rand_val < place_and_postcode_probability: + components = self.components.drop_postcode(components) + + formatted = self.formatter.format_address(components, country, language=language, + minimal_only=False, tag_components=tag_components) + yield (language, country, formatted) + def build_training_data(self, base_dir, out_dir, tag_components=True): if tag_components: formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')