From 8f358d295f8d285340eef9b907fafa98dbf6f20c Mon Sep 17 00:00:00 2001 From: Al Date: Sat, 21 May 2016 17:54:30 -0400 Subject: [PATCH] [addresses] Adding address-level component dropout to AddressComponents (returns an ordering so the client formatter can potentially emit multiple addresses with different components dropped out). Adding PO box and category probabilities to config --- resources/parser/default.yaml | 77 +++++++++++++++ .../geodata/address_formatting/formatter.py | 6 +- scripts/geodata/addresses/components.py | 98 +++++++++++++------ 3 files changed, 148 insertions(+), 33 deletions(-) diff --git a/resources/parser/default.yaml b/resources/parser/default.yaml index 9ccf0449..7727983a 100644 --- a/resources/parser/default.yaml +++ b/resources/parser/default.yaml @@ -7,6 +7,83 @@ languages: # Replace user-tagged admin components with the non-local language version replace_non_local_probability: 0.4 +# Dependencies for including each component in an "address" +# Two-way dependencies are not an issue +component_dependencies: + house: + dependencies: [] + + road: + dependencies: + - house + - house_number + - suburb + - city_district + - city + - postcode + + house_number: + dependencies: + - road + + entrance: + dependencies: + - house_number + + staircase: + dependencies: + - house_number + + level: + dependencies: + - house_number + + unit: + dependencies: + - house_number + + postcode: + dependencies: [] + + +# Each component is dropped out separately and a new address +# is added to the training set. These are only the address-level +# components. Places/boundaries are taken care of elsewhere. 
+dropout: + attention: + probability: 0.8 + care_of: + probability: 0.8 + house: + probability: 0.6 + house_number: + probability: 0.5 + road: + probability: 0.4 + entrance: + probability: 0.8 + staircase: + probability: 0.8 + level: + probability: 0.6 + unit: + probability: 0.5 + postcode: + probability: 0.6 + +po_box: + probability: 0.1 + # Note: these probabilities all independent (don't need to sum to 1) + drop_address_probability: 0.8 # drop house number, road, etc. + drop_places_probability: 0.1 # drop place names + drop_postcode_probability: 0.3 # drop postal code + +category: + # Same thing for category queries + drop_address_probability: 0.8 # drop house number, road, etc. + drop_places_probability: 0.1 # drop place names + drop_postcode_probability: 0.3 # drop postal code + neighborhood: # Usually in Germany, may have e.g. name:prefix=Ortsteil add_prefix_probability: 0.5 diff --git a/scripts/geodata/address_formatting/formatter.py b/scripts/geodata/address_formatting/formatter.py index 25181df5..406d59e1 100644 --- a/scripts/geodata/address_formatting/formatter.py +++ b/scripts/geodata/address_formatting/formatter.py @@ -52,18 +52,18 @@ class AddressFormatter(object): CATEGORY = 'category' NEAR = 'near' + ATTENTION = 'attention' + CARE_OF = 'care_of' HOUSE = 'house' HOUSE_NUMBER = 'house_number' PO_BOX = 'po_box' - ATTENTION = 'attention' - CARE_OF = 'care_of' + ROAD = 'road' BUILDING = 'building' ENTRANCE = 'entrance' STAIRCASE = 'staircase' LEVEL = 'level' UNIT = 'unit' INTERSECTION = 'intersection' - ROAD = 'road' SUBDIVISION = 'subdivision' SUBURB = 'suburb' CITY_DISTRICT = 'city_district' diff --git a/scripts/geodata/addresses/components.py b/scripts/geodata/addresses/components.py index 6c3c0882..f2a3ed81 100644 --- a/scripts/geodata/addresses/components.py +++ b/scripts/geodata/addresses/components.py @@ -1,3 +1,4 @@ +import operator import os import pycountry import random @@ -5,6 +6,7 @@ import six import yaml from collections import defaultdict, 
OrderedDict
+from itertools import combinations
 
 from geodata.address_formatting.formatter import AddressFormatter
 
@@ -40,8 +42,6 @@ class ComponentDependencies(object):
     a house_numer cannot be used in the absence of a road name.
     '''
 
-    ANY = 'any'
-    ALL = 'all'
 
     def __init__(self, name, dependencies=tuple()):
         self.name = name
@@ -121,6 +121,8 @@ class AddressComponents(object):
         self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
 
         self.setup_component_dependencies()
+        # Non-admin component dropout
+        self.address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(self.config['dropout'])}
 
         self.osm_admin_rtree = osm_admin_rtree
         self.language_rtree = language_rtree
@@ -129,17 +131,64 @@ class AddressComponents(object):
         self.geonames = geonames
 
     def setup_component_dependencies(self):
-        self.component_dependencies = OrderedDict()
-        deps = self.config.get('component_dependencies', {})
-        for component, conf in six.iteritems(deps):
-            dep_list = []
-            for dep in conf['dependencies']:
-                for k in (ComponentDependencies.ANY, ComponentDependencies.ALL):
-                    if k in dep:
-                        dep_list.append((k, dep[k]))
-                        break
+        self.component_dependencies = {}
+        self.component_bit_values = {}
+        self.valid_component_bitsets = set()
+        self.component_combinations = set()
 
-            self.component_dependencies[component] = ComponentDependencies(component, dep_list)
+        forward_deps = self.config.get('component_dependencies', {})
+
+        for component in list(forward_deps) + [d for conf in six.itervalues(forward_deps) for d in (conf['dependencies'] or ())]:
+            self.component_bit_values.setdefault(component, 1 << len(self.component_bit_values))  # dependency-only names (e.g. city) need a bit too
+
+        all_values = self.component_bitset(forward_deps)
+
+        for component, conf in six.iteritems(forward_deps):
+            deps = conf['dependencies']
+            self.component_dependencies[component] = self.component_bitset(deps) if deps else all_values
+
+    def component_bitset(self, components):
+        return six.moves.reduce(operator.or_, (self.component_bit_values.get(c, 0) for c in components), 0)  # untracked names contribute no bits; empty input -> 0
+
+    def address_level_dropout_order(self, components):
+        '''
+        Address component dropout
+        -------------------------
+
+        To make the parser more robust to different kinds of input (not every address is fully
+        specified, especially in a geocoder, on mobile, with autocomplete, etc.), we want to
+        train the parser with many types of addresses.
+
+        This will help the parser not become too reliant on component order, e.g. it won't think
+        that the first token in a string is always the venue name simply because that was the case
+        in the training data.
+
+        This method returns a dropout ordering ensuring that if the components are dropped in order,
+        each set will be valid. In the parser config (resources/parser/default.yaml), the dependencies
+        for each address component are specified, e.g. "house_number" depends on "road", so it would
+        be invalid to have an address that was simply a house number with no other information. The
+        caller of this method may decide to drop all the components at once or one at a time, creating
+        N training examples from a single address.
+        '''
+        component_bitset = self.component_bitset(components)
+
+        candidates = [c for c in components if c in self.address_level_dropout_probabilities]
+        random.shuffle(candidates)
+        retained = set(candidates)
+
+        dropout_order = []
+
+        for component in candidates[:-1]:
+            if random.random() >= self.address_level_dropout_probabilities.get(component, 0.0):
+                continue
+            bit_value = self.component_bit_values.get(component, 0)
+            candidate_bitset = component_bitset ^ bit_value
+
+            if all((candidate_bitset & self.component_dependencies[c] for c in retained if c != component and c in self.component_dependencies)):
+                dropout_order.append(component)
+                component_bitset = candidate_bitset
+                retained.remove(component)
+        return dropout_order
 
     def strip_keys(self, value, ignore_keys):
         for key in ignore_keys:
@@ -930,23 +979,12 @@ class AddressComponents(object):
         else:
             return None
 
-    def category_components(self, category_query, address_components, language, country=None):
-        category_config = self.config['category']
-        address_components[AddressFormatter.CATEGORY] = 
category_query.category
-        if category_query.prep:
-            address_components[AddressFormatter.NEAR] = category_query.prep
-
-        drop_address_probability = category_config['drop_address_probability']
-        if random.random() < drop_address_probability:
-            address_components = self.drop_address(address_components)
-
-        drop_postcode_probability = category_config['drop_postcode_probability']
-        if random.random() < drop_postcode_probability:
-            address_components = self.drop_postcode(address_components)
-
-        if not category_query.add_place_name:
-            address_components = self.drop_places(address_components)
-        return address_components
+    def dropout_address_level_component(self, address_components, component):
+        probability = self.address_level_dropout_probabilities.get(component, None)
+        if probability is not None and component in address_components and random.random() < probability:
+            address_components.pop(component)  # membership checked above, so this cannot raise KeyError
+            return True
+        return False
 
     def expanded(self, address_components, latitude, longitude,
                  dropout_places=True, add_sub_building_components=True,
@@ -1025,7 +1063,7 @@ class AddressComponents(object):
 
         if dropout_places:
             # Perform dropout on places
-            address_components = place_config.drop_components(address_components, all_osm_components, country=country)
+            address_components = place_config.dropout_components(address_components, all_osm_components, country=country)
 
         return address_components, country, language
 