[addresses] Adding address-level component dropout to AddressComponents (returns an ordering so the client formatter can potentially emit multiple addresses with different components dropped out). Adding PO box and category probabilities to config

This commit is contained in:
Al
2016-05-21 17:54:30 -04:00
parent e1aec72c32
commit 8f358d295f
3 changed files with 148 additions and 33 deletions

View File

@@ -7,6 +7,83 @@ languages:
# Replace user-tagged admin components with the non-local language version
replace_non_local_probability: 0.4
# Dependencies for including each component in an "address"
# Two-way dependencies are not an issue
component_dependencies:
house:
dependencies: []
road:
dependencies:
- house
- house_number
- suburb
- city_district
- city
- postcode
house_number:
dependencies:
- road
entrance:
dependencies:
- house_number
staircase:
dependencies:
- house_number
level:
dependencies:
- house_number
unit:
dependencies:
- house_number
postcode:
dependencies: []
# Each component is dropped out separately and a new address
# is added to the training set. These are only the address-level
# components. Places/boundaries are taken care of elsewhere.
dropout:
attention:
probability: 0.8
care_of:
probability: 0.8
house:
probability: 0.6
house_number:
probability: 0.5
road:
probability: 0.4
entrance:
probability: 0.8
staircase:
probability: 0.8
level:
probability: 0.6
unit:
probability: 0.5
postcode:
probability: 0.6
po_box:
probability: 0.1
# Note: these probabilities are all independent (they don't need to sum to 1)
drop_address_probability: 0.8 # drop house number, road, etc.
drop_places_probability: 0.1 # drop place names
drop_postcode_probability: 0.3 # drop postal code
category:
# Same thing for category queries
drop_address_probability: 0.8 # drop house number, road, etc.
drop_places_probability: 0.1 # drop place names
drop_postcode_probability: 0.3 # drop postal code
neighborhood:
# Usually in Germany, may have e.g. name:prefix=Ortsteil
add_prefix_probability: 0.5

View File

@@ -52,18 +52,18 @@ class AddressFormatter(object):
CATEGORY = 'category'
NEAR = 'near'
ATTENTION = 'attention'
CARE_OF = 'care_of'
HOUSE = 'house'
HOUSE_NUMBER = 'house_number'
PO_BOX = 'po_box'
ATTENTION = 'attention'
CARE_OF = 'care_of'
ROAD = 'road'
BUILDING = 'building'
ENTRANCE = 'entrance'
STAIRCASE = 'staircase'
LEVEL = 'level'
UNIT = 'unit'
INTERSECTION = 'intersection'
ROAD = 'road'
SUBDIVISION = 'subdivision'
SUBURB = 'suburb'
CITY_DISTRICT = 'city_district'

View File

@@ -1,3 +1,4 @@
import operator
import os
import pycountry
import random
@@ -5,6 +6,7 @@ import six
import yaml
from collections import defaultdict, OrderedDict
from itertools import combinations
from geodata.address_formatting.formatter import AddressFormatter
@@ -40,8 +42,6 @@ class ComponentDependencies(object):
a house_number cannot be used in the absence of a road name.
'''
ANY = 'any'
ALL = 'all'
def __init__(self, name, dependencies=tuple()):
self.name = name
@@ -121,6 +121,8 @@ class AddressComponents(object):
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
self.setup_component_dependencies()
# Non-admin component dropout
self.address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(self.config['dropout'])}
self.osm_admin_rtree = osm_admin_rtree
self.language_rtree = language_rtree
@@ -129,17 +131,64 @@ class AddressComponents(object):
self.geonames = geonames
def setup_component_dependencies(self):
self.component_dependencies = OrderedDict()
deps = self.config.get('component_dependencies', {})
for component, conf in six.iteritems(deps):
dep_list = []
for dep in conf['dependencies']:
for k in (ComponentDependencies.ANY, ComponentDependencies.ALL):
if k in dep:
dep_list.append((k, dep[k]))
break
self.component_dependencies = {}
self.component_bit_values = {}
self.valid_component_bitsets = set()
self.component_combinations = set()
self.component_dependencies[component] = ComponentDependencies(component, dep_list)
forward_deps = self.config.get('component_dependencies', {})
for i, component in enumerate(forward_deps):
self.component_bit_values[component] = 1 << i
all_values = self.component_bitset(forward_deps)
for component, conf in six.iteritems(forward_deps):
deps = conf['dependencies']
self.component_dependencies[component] = self.component_bitset(deps) if deps else all_values
def component_bitset(self, components):
    '''
    Combine the bit values of the given components into a single integer bitset.

    :param components: iterable of component names (a list, or a dict whose
                       keys are component names)
    :return: bitwise OR of self.component_bit_values for every name in
             components; 0 when components is empty
    '''
    bitset = 0
    for component in components:
        # Names without an assigned bit contribute nothing, matching the
        # .get(component, 0) lookup used by address_level_dropout_order.
        bitset |= self.component_bit_values.get(component, 0)
    return bitset
def address_level_dropout_order(self, components):
    '''
    Address component dropout
    -------------------------

    To make the parser more robust to different kinds of input (not every address is fully
    specified, especially in a geocoder, on mobile, with autocomplete, etc.), we want to
    train the parser with many types of addresses.

    This will help the parser not become too reliant on component order, e.g. it won't think
    that the first token in a string is always the venue name simply because that was the case
    in the training data.

    This method returns a dropout ordering ensuring that if the components are dropped in order,
    each set will be valid. In the parser config (resources/parser/default.yaml), the dependencies
    for each address component are specified, e.g. "house_number" depends on "road", so it would
    be invalid to have an address that was simply a house number with no other information. The
    caller of this method may decide to drop all the components at once or one at a time, creating
    N training examples from a single address.

    :param components: iterable of component names present in the address
    :return: list of component names, in the order in which they may be dropped;
             empty when nothing can or should be dropped
    '''
    # Only components with an assigned bit take part in the bitset. Components
    # that have a dropout probability but no dependency entry would otherwise
    # raise a KeyError, and an empty address would break the reduction.
    known = [c for c in components if c in self.component_bit_values]
    component_bitset = self.component_bitset(known) if known else 0

    candidates = [c for c in components if c in self.address_level_dropout_probabilities]
    random.shuffle(candidates)
    retained = set(candidates)

    dropout_order = []

    # The last shuffled candidate is never considered, so at least one
    # candidate always survives.
    for component in candidates[:-1]:
        if component not in retained:
            # Already dropped (components contained a duplicate name)
            continue

        if random.random() >= self.address_level_dropout_probabilities.get(component, 0.0):
            continue

        bit_value = self.component_bit_values.get(component, 0)
        # The component's bit is set in component_bitset, so XOR clears it
        candidate_bitset = component_bitset ^ bit_value

        # Every other retained component must still have at least one of its
        # dependencies present. Components with no dependency entry impose
        # no constraint.
        if all(candidate_bitset & self.component_dependencies[c]
               for c in retained
               if c != component and c in self.component_dependencies):
            dropout_order.append(component)
            component_bitset = candidate_bitset
            retained.remove(component)

    return dropout_order
def strip_keys(self, value, ignore_keys):
for key in ignore_keys:
@@ -930,23 +979,12 @@ class AddressComponents(object):
else:
return None
def category_components(self, category_query, address_components, language, country=None):
    '''
    Attach a category query to an address and randomly thin the address out.

    The category (and the preposition, when the query has one) is written
    into address_components. The address-level components and the postcode
    are then each dropped with the probability configured under "category",
    and place names are dropped whenever the query does not ask for one.

    :param category_query: object exposing category, prep and add_place_name
    :param address_components: dict of components; modified in place
    :param language: not used by this method
    :param country: not used by this method
    :return: the resulting components, as returned by the drop_* helpers
    '''
    settings = self.config['category']

    address_components[AddressFormatter.CATEGORY] = category_query.category
    if category_query.prep:
        address_components[AddressFormatter.NEAR] = category_query.prep

    # Independent draws, taken in the same order: address first, then postcode
    for probability_key, method_name in (('drop_address_probability', 'drop_address'),
                                         ('drop_postcode_probability', 'drop_postcode')):
        threshold = settings[probability_key]
        if random.random() < threshold:
            address_components = getattr(self, method_name)(address_components)

    if category_query.add_place_name:
        return address_components

    return self.drop_places(address_components)
def dropout_address_level_component(self, address_components, component):
    '''
    Randomly remove a single address-level component from an address.

    :param address_components: dict of components; modified in place when
                               the component is dropped
    :param component: name of the component to consider
    :return: True if the component was removed, False otherwise (component
             not in the address, no dropout probability configured for it,
             or the random draw said to keep it)
    '''
    # Nothing to drop. Without this check the removal below would raise a
    # KeyError for a component that is not part of this address.
    if component not in address_components:
        return False

    probability = self.address_level_dropout_probabilities.get(component)
    if probability is None or random.random() >= probability:
        return False

    del address_components[component]
    return True
def expanded(self, address_components, latitude, longitude,
dropout_places=True, add_sub_building_components=True,
@@ -1025,7 +1063,7 @@ class AddressComponents(object):
if dropout_places:
# Perform dropout on places
address_components = place_config.drop_components(address_components, all_osm_components, country=country)
address_components = place_config.dropout_components(address_components, all_osm_components, country=country)
return address_components, country, language