[openaddresses] Add place-only and place+postcode probabilities to the OpenAddresses formatter, so that place names not present in OSM are also captured as standalone queries
This commit is contained in:
@@ -9,6 +9,9 @@ global:
|
|||||||
abbreviate_unit_probability: 0.3
|
abbreviate_unit_probability: 0.3
|
||||||
separate_unit_probability: 0.2
|
separate_unit_probability: 0.2
|
||||||
|
|
||||||
|
place_only_probability: 0.2
|
||||||
|
place_and_postcode_probability: 0.1
|
||||||
|
|
||||||
fields: &default_fields
|
fields: &default_fields
|
||||||
- field_name: NUMBER
|
- field_name: NUMBER
|
||||||
component: house_number
|
component: house_number
|
||||||
|
|||||||
@@ -250,6 +250,11 @@ class OpenAddressesFormatter(object):
|
|||||||
numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
|
numeric_postcodes_only = bool(self.get_property('numeric_postcodes_only', *configs) or False)
|
||||||
postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)
|
postcode_strip_non_digit_chars = bool(self.get_property('postcode_strip_non_digit_chars', *configs) or False)
|
||||||
|
|
||||||
|
place_only_probability = float(self.get_property('place_only_probability', *configs))
|
||||||
|
place_and_postcode_probability = float(self.get_property('place_and_postcode_probability', *configs))
|
||||||
|
|
||||||
|
drop_address_probability = place_only_probability + place_and_postcode_probability
|
||||||
|
|
||||||
ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])
|
ignore_rows_missing_fields = set(self.get_property('ignore_rows_missing_fields', *configs) or [])
|
||||||
|
|
||||||
ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
|
ignore_fields_containing = {field: re.compile(six.u('|').join([six.u('(?:{})').format(safe_decode(v)) for v in value]), re.I | re.UNICODE)
|
||||||
@@ -463,6 +468,18 @@ class OpenAddressesFormatter(object):
|
|||||||
minimal_only=False, tag_components=tag_components)
|
minimal_only=False, tag_components=tag_components)
|
||||||
yield (language, country, formatted)
|
yield (language, country, formatted)
|
||||||
|
|
||||||
|
rand_val = random.random()
|
||||||
|
|
||||||
|
if street and house_number and rand_val < drop_address_probability:
|
||||||
|
components = self.components.drop_address(components)
|
||||||
|
|
||||||
|
if rand_val < place_and_postcode_probability:
|
||||||
|
components = self.components.drop_postcode(components)
|
||||||
|
|
||||||
|
formatted = self.formatter.format_address(components, country, language=language,
|
||||||
|
minimal_only=False, tag_components=tag_components)
|
||||||
|
yield (language, country, formatted)
|
||||||
|
|
||||||
def build_training_data(self, base_dir, out_dir, tag_components=True):
|
def build_training_data(self, base_dir, out_dir, tag_components=True):
|
||||||
if tag_components:
|
if tag_components:
|
||||||
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
formatted_tagged_file = open(os.path.join(out_dir, OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME), 'w')
|
||||||
|
|||||||
Reference in New Issue
Block a user