[addresses] Adding address-level component dropout to AddressComponents (returns an ordering so the client formatter can potentially emit multiple addresses with different components dropped out). Adding PO box and category probabilities to config
This commit is contained in:
@@ -7,6 +7,83 @@ languages:
|
||||
# Replace user-tagged admin components with the non-local language version
|
||||
replace_non_local_probability: 0.4
|
||||
|
||||
# Dependencies for including each component in an "address"
|
||||
# Two-way dependencies are not an issue
|
||||
component_dependencies:
|
||||
house:
|
||||
dependencies: []
|
||||
|
||||
road:
|
||||
dependencies:
|
||||
- house
|
||||
- house_number
|
||||
- suburb
|
||||
- city_district
|
||||
- city
|
||||
- postcode
|
||||
|
||||
house_number:
|
||||
dependencies:
|
||||
- road
|
||||
|
||||
entrance:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
staircase:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
level:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
unit:
|
||||
dependencies:
|
||||
- house_number
|
||||
|
||||
postcode:
|
||||
dependencies: []
|
||||
|
||||
|
||||
# Each component is dropped out separately and a new address
|
||||
# is added to the training set. These are only the address-level
|
||||
# components. Places/boundaries are taken care of elsewhere.
|
||||
dropout:
|
||||
attention:
|
||||
probability: 0.8
|
||||
care_of:
|
||||
probability: 0.8
|
||||
house:
|
||||
probability: 0.6
|
||||
house_number:
|
||||
probability: 0.5
|
||||
road:
|
||||
probability: 0.4
|
||||
entrance:
|
||||
probability: 0.8
|
||||
staircase:
|
||||
probability: 0.8
|
||||
level:
|
||||
probability: 0.6
|
||||
unit:
|
||||
probability: 0.5
|
||||
postcode:
|
||||
probability: 0.6
|
||||
|
||||
po_box:
|
||||
probability: 0.1
|
||||
# Note: these probabilities all independent (don't need to sum to 1)
|
||||
drop_address_probability: 0.8 # drop house number, road, etc.
|
||||
drop_places_probability: 0.1 # drop place names
|
||||
drop_postcode_probability: 0.3 # drop postal code
|
||||
|
||||
category:
|
||||
# Same thing for category queries
|
||||
drop_address_probability: 0.8 # drop house number, road, etc.
|
||||
drop_places_probability: 0.1 # drop place names
|
||||
drop_postcode_probability: 0.3 # drop postal code
|
||||
|
||||
neighborhood:
|
||||
# Usually in Germany, may have e.g. name:prefix=Ortsteil
|
||||
add_prefix_probability: 0.5
|
||||
|
||||
@@ -52,18 +52,18 @@ class AddressFormatter(object):
|
||||
|
||||
CATEGORY = 'category'
|
||||
NEAR = 'near'
|
||||
ATTENTION = 'attention'
|
||||
CARE_OF = 'care_of'
|
||||
HOUSE = 'house'
|
||||
HOUSE_NUMBER = 'house_number'
|
||||
PO_BOX = 'po_box'
|
||||
ATTENTION = 'attention'
|
||||
CARE_OF = 'care_of'
|
||||
ROAD = 'road'
|
||||
BUILDING = 'building'
|
||||
ENTRANCE = 'entrance'
|
||||
STAIRCASE = 'staircase'
|
||||
LEVEL = 'level'
|
||||
UNIT = 'unit'
|
||||
INTERSECTION = 'intersection'
|
||||
ROAD = 'road'
|
||||
SUBDIVISION = 'subdivision'
|
||||
SUBURB = 'suburb'
|
||||
CITY_DISTRICT = 'city_district'
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import operator
|
||||
import os
|
||||
import pycountry
|
||||
import random
|
||||
@@ -5,6 +6,7 @@ import six
|
||||
import yaml
|
||||
|
||||
from collections import defaultdict, OrderedDict
|
||||
from itertools import combinations
|
||||
|
||||
from geodata.address_formatting.formatter import AddressFormatter
|
||||
|
||||
@@ -40,8 +42,6 @@ class ComponentDependencies(object):
|
||||
a house_numer cannot be used in the absence of a road name.
|
||||
'''
|
||||
|
||||
ANY = 'any'
|
||||
ALL = 'all'
|
||||
|
||||
def __init__(self, name, dependencies=tuple()):
|
||||
self.name = name
|
||||
@@ -121,6 +121,8 @@ class AddressComponents(object):
|
||||
self.config = yaml.load(open(PARSER_DEFAULT_CONFIG))
|
||||
|
||||
self.setup_component_dependencies()
|
||||
# Non-admin component dropout
|
||||
self.address_level_dropout_probabilities = {k: v['probability'] for k, v in six.iteritems(self.config['dropout'])}
|
||||
|
||||
self.osm_admin_rtree = osm_admin_rtree
|
||||
self.language_rtree = language_rtree
|
||||
@@ -129,17 +131,64 @@ class AddressComponents(object):
|
||||
self.geonames = geonames
|
||||
|
||||
def setup_component_dependencies(self):
|
||||
self.component_dependencies = OrderedDict()
|
||||
deps = self.config.get('component_dependencies', {})
|
||||
for component, conf in six.iteritems(deps):
|
||||
dep_list = []
|
||||
for dep in conf['dependencies']:
|
||||
for k in (ComponentDependencies.ANY, ComponentDependencies.ALL):
|
||||
if k in dep:
|
||||
dep_list.append((k, dep[k]))
|
||||
break
|
||||
self.component_dependencies = {}
|
||||
self.component_bit_values = {}
|
||||
self.valid_component_bitsets = set()
|
||||
self.component_combinations = set()
|
||||
|
||||
self.component_dependencies[component] = ComponentDependencies(component, dep_list)
|
||||
forward_deps = self.config.get('component_dependencies', {})
|
||||
|
||||
for i, component in enumerate(forward_deps):
|
||||
self.component_bit_values[component] = 1 << i
|
||||
|
||||
all_values = self.component_bitset(forward_deps)
|
||||
|
||||
for component, conf in six.iteritems(forward_deps):
|
||||
deps = conf['dependencies']
|
||||
self.component_dependencies[component] = self.component_bitset(deps) if deps else all_values
|
||||
|
||||
def component_bitset(self, components):
|
||||
return reduce(operator.or_, [self.component_bit_values[c] for c in components])
|
||||
|
||||
def address_level_dropout_order(self, components):
|
||||
'''
|
||||
Address component dropout
|
||||
-------------------------
|
||||
|
||||
To make the parser more robust to different kinds of input (not every address is fully
|
||||
specified, especially in a geocoder, on mobile, with autocomplete, etc.), we want to
|
||||
train the parser with many types of addresses.
|
||||
|
||||
This will help the parser not become too reliant on component order, e.g. it won't think
|
||||
that the first token in a string is always the venue name simply because that was the case
|
||||
in the training data.
|
||||
|
||||
This method returns a dropout ordering ensuring that if the components are dropped in order,
|
||||
each set will be valid. In the parser config (resources/parser/default.yaml), the dependencies
|
||||
for each address component are specified, e.g. "house_number" depends on "road", so it would
|
||||
be invalid to have an address that was simply a house number with no other information. The
|
||||
caller of this method may decide to drop all the components at once or one at a time, creating
|
||||
N training examples from a single address.
|
||||
'''
|
||||
component_bitset = self.component_bitset(components)
|
||||
|
||||
candidates = [c for c in components if c in self.address_level_dropout_probabilities]
|
||||
random.shuffle(candidates)
|
||||
retained = set(candidates)
|
||||
|
||||
dropout_order = []
|
||||
|
||||
for component in candidates[:-1]:
|
||||
if random.random() >= self.address_level_dropout_probabilities.get(component, 0.0):
|
||||
continue
|
||||
bit_value = self.component_bit_values.get(component, 0)
|
||||
candidate_bitset = component_bitset ^ bit_value
|
||||
|
||||
if all((candidate_bitset & self.component_dependencies[c] for c in retained if c != component)):
|
||||
dropout_order.append(component)
|
||||
component_bitset = candidate_bitset
|
||||
retained.remove(component)
|
||||
return dropout_order
|
||||
|
||||
def strip_keys(self, value, ignore_keys):
|
||||
for key in ignore_keys:
|
||||
@@ -930,23 +979,12 @@ class AddressComponents(object):
|
||||
else:
|
||||
return None
|
||||
|
||||
def category_components(self, category_query, address_components, language, country=None):
|
||||
category_config = self.config['category']
|
||||
address_components[AddressFormatter.CATEGORY] = category_query.category
|
||||
if category_query.prep:
|
||||
address_components[AddressFormatter.NEAR] = category_query.prep
|
||||
|
||||
drop_address_probability = category_config['drop_address_probability']
|
||||
if random.random() < drop_address_probability:
|
||||
address_components = self.drop_address(address_components)
|
||||
|
||||
drop_postcode_probability = category_config['drop_postcode_probability']
|
||||
if random.random() < drop_postcode_probability:
|
||||
address_components = self.drop_postcode(address_components)
|
||||
|
||||
if not category_query.add_place_name:
|
||||
address_components = self.drop_places(address_components)
|
||||
return address_components
|
||||
def dropout_address_level_component(self, address_components, component):
|
||||
probability = self.address_level_dropout_probabilities.get(component, None)
|
||||
if probability is not None and random.random() < probability:
|
||||
address_components.pop(component)
|
||||
return True
|
||||
return False
|
||||
|
||||
def expanded(self, address_components, latitude, longitude,
|
||||
dropout_places=True, add_sub_building_components=True,
|
||||
@@ -1025,7 +1063,7 @@ class AddressComponents(object):
|
||||
|
||||
if dropout_places:
|
||||
# Perform dropout on places
|
||||
address_components = place_config.drop_components(address_components, all_osm_components, country=country)
|
||||
address_components = place_config.dropout_components(address_components, all_osm_components, country=country)
|
||||
|
||||
return address_components, country, language
|
||||
|
||||
|
||||
Reference in New Issue
Block a user